# This project is the crime project created by Yu Qiushi — he did it all himself!
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import altair as alt 
import holoviews as hv
import seaborn as sns
import hvplot.pandas
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable
import geoviews as gv
import geoviews.tile_sources as gvts
import folium
import xyzservices
from folium.plugins import TimestampedGeoJson
import osmnx as ox
import pandana as pnda
from pandana.loaders import osm
import requests
from bs4 import BeautifulSoup
%matplotlib inline
from folium import GeoJson
import panel as pn
import warnings

# Silence the noisy deprecation chatter from pandas/geopandas up front.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", message="the convert_dtype parameter is deprecated")

# Philadelphia boundary ships in web mercator (EPSG:3857); normalize to WGS84.
philadelphia_range_path = "final_data/city_limits_3857.geojson"
city_limits = gpd.read_file(philadelphia_range_path)
if city_limits.crs.to_string() != "EPSG:4326":
    city_limits = city_limits.to_crs(epsg=4326)
print(city_limits.crs)
EPSG:4326
city_limits.explore()  # interactive leaflet preview of the boundary (notebook output)
Make this Notebook Trusted to load map: File -> Trust Notebook
# Pull Part I/II crime incidents (2022-01-01 .. 2024-12-23) from the City of
# Philadelphia Carto SQL endpoint.
base_url = "https://phl.carto.com/api/v2/sql"
sql_query = """
SELECT * FROM incidents_part1_part2
WHERE dispatch_date >= '2022-01-01' AND dispatch_date <= '2024-12-23'
"""
params = {
    'q': sql_query
}

# Fail loudly on a bad response instead of crashing later on a missing 'rows'
# key (mirrors the status check used for the housing-data request below).
response = requests.get(base_url, params=params, timeout=120)
if response.status_code != 200:
    raise Exception(f"API request failed with status {response.status_code}: {response.text}")
data = response.json()

crime = pd.DataFrame(data['rows'])
print(crime.head())

print(len(crime))
   cartodb_id                                           the_geom  \
0           2  0101000020E6100000A51C8299A5C752C006342AD3DCFF...   
1           4  0101000020E6100000F9245E3B64CC52C0B7195D940FF6...   
2           7  0101000020E6100000118A52E7F6C052C0CFF41263190C...   
3         123  0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02...   
4         126  0101000020E6100000D1CCD5875CCA52C014B723FFC005...   

                                the_geom_webmercator  objectid dc_dist psa  \
0  0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F...       114      25   3   
1  0101000020110F00000426B7CE54EE5FC1C5E06D37E284...       116      01   1   
2  0101000020110F00006728CED7EBDA5FC169DB64F8519D...       119      08   2   
3  0101000020110F00009D28D4D968E25FC13CD5C3D06F92...        96      15   1   
4  0101000020110F00002F28E30AE2EA5FC10090A3314796...        99      14   1   

     dispatch_date_time dispatch_date dispatch_time  hour        dc_key  \
0  2023-03-11T17:12:00Z    2023-03-11      12:12:00  12.0  202325014482   
1  2023-03-11T18:31:00Z    2023-03-11      13:31:00  13.0  202301004597   
2  2023-03-11T22:13:00Z    2023-03-11      17:13:00  17.0  202308008412   
3  2023-03-11T12:42:00Z    2023-03-11      07:42:00   7.0  202315017366   
4  2023-03-12T00:54:00Z    2023-03-11      19:54:00  19.0  202314012625   

              location_block ucr_general   text_general_code    point_x  \
0    3300 BLOCK HARTVILLE ST         300  Robbery No Firearm -75.119482   
1       2400 BLOCK S 28TH ST         600  Theft from Vehicle -75.193618   
2  9800 BLOCK Roosevelt Blvd         600              Thefts -75.015070   
3      4700 BLOCK GRISCOM ST         600              Thefts -75.083953   
4        5500 BLOCK BLOYD ST         300  Robbery No Firearm -75.161898   

     point_y  
0  39.998927  
1  39.922350  
2  40.094525  
3  40.017896  
4  40.044952  
477568
from shapely import wkb

def _parse_hex_wkb(hex_str):
    """Decode a hex-encoded WKB string into a shapely geometry (None on failure)."""
    try:
        return wkb.loads(hex_str, hex=True)
    except Exception:
        return None

# Carto returns 'the_geom' as hex-encoded WKB text. hex=True makes the decoding
# explicit (rather than relying on shapely 2.x string auto-detection) and the
# wrapper tolerates missing/garbled rows, matching safe_wkb_loads used for the
# housing data further down.
crime['geometry'] = crime['the_geom'].apply(_parse_hex_wkb)

crime_gdf = gpd.GeoDataFrame(crime, geometry='geometry', crs="EPSG:4326")

# Align CRS with the city-limits layer so overlays are valid.
if crime_gdf.crs != city_limits.crs:
    crime_gdf = crime_gdf.to_crs(city_limits.crs)

print("Columns in the crime GeoDataFrame:", crime_gdf.columns.tolist())
print(crime_gdf.head())
Columns in the crime GeoDataFrame: ['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist', 'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour', 'dc_key', 'location_block', 'ucr_general', 'text_general_code', 'point_x', 'point_y', 'geometry']
   cartodb_id                                           the_geom  \
0           2  0101000020E6100000A51C8299A5C752C006342AD3DCFF...   
1           4  0101000020E6100000F9245E3B64CC52C0B7195D940FF6...   
2           7  0101000020E6100000118A52E7F6C052C0CFF41263190C...   
3         123  0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02...   
4         126  0101000020E6100000D1CCD5875CCA52C014B723FFC005...   

                                the_geom_webmercator  objectid dc_dist psa  \
0  0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F...       114      25   3   
1  0101000020110F00000426B7CE54EE5FC1C5E06D37E284...       116      01   1   
2  0101000020110F00006728CED7EBDA5FC169DB64F8519D...       119      08   2   
3  0101000020110F00009D28D4D968E25FC13CD5C3D06F92...        96      15   1   
4  0101000020110F00002F28E30AE2EA5FC10090A3314796...        99      14   1   

     dispatch_date_time dispatch_date dispatch_time  hour        dc_key  \
0  2023-03-11T17:12:00Z    2023-03-11      12:12:00  12.0  202325014482   
1  2023-03-11T18:31:00Z    2023-03-11      13:31:00  13.0  202301004597   
2  2023-03-11T22:13:00Z    2023-03-11      17:13:00  17.0  202308008412   
3  2023-03-11T12:42:00Z    2023-03-11      07:42:00   7.0  202315017366   
4  2023-03-12T00:54:00Z    2023-03-11      19:54:00  19.0  202314012625   

              location_block ucr_general   text_general_code    point_x  \
0    3300 BLOCK HARTVILLE ST         300  Robbery No Firearm -75.119482   
1       2400 BLOCK S 28TH ST         600  Theft from Vehicle -75.193618   
2  9800 BLOCK Roosevelt Blvd         600              Thefts -75.015070   
3      4700 BLOCK GRISCOM ST         600              Thefts -75.083953   
4        5500 BLOCK BLOYD ST         300  Robbery No Firearm -75.161898   

     point_y                    geometry  
0  39.998927  POINT (-75.11948 39.99893)  
1  39.922350  POINT (-75.19362 39.92235)  
2  40.094525  POINT (-75.01507 40.09452)  
3  40.017896  POINT (-75.08395 40.01790)  
4  40.044952  POINT (-75.16190 40.04495)  
# Activate the bokeh backend for holoviews/hvplot output.
hv.extension('bokeh')

# Parse dispatch dates and derive a 'YYYY-MM' bucket for monthly aggregation.
dispatch_dates = pd.to_datetime(crime_gdf['dispatch_date'])
crime_gdf['dispatch_date'] = dispatch_dates
crime_gdf['year_month'] = dispatch_dates.dt.to_period('M').astype(str)

# Monthly incident counts per raw offense type.
crime_monthly = (
    crime_gdf
    .groupby(['text_general_code', 'year_month'])
    .size()
    .reset_index(name='count')
)

unique_crimes = crime_gdf['text_general_code'].unique()
print(f"Unique crime types ({len(unique_crimes)}):", unique_crimes)
Unique crime types (32): ['Robbery No Firearm' 'Theft from Vehicle' 'Thefts'
 'Aggravated Assault Firearm' 'Aggravated Assault No Firearm' 'Rape'
 'Burglary Residential' 'Robbery Firearm' 'Burglary Non-Residential'
 'Other Assaults' 'Vandalism/Criminal Mischief'
 'Narcotic / Drug Law Violations' 'Fraud'
 'Offenses Against Family and Children' 'Weapon Violations'
 'All Other Offenses' 'DRIVING UNDER THE INFLUENCE'
 'Other Sex Offenses (Not Commercialized)' 'Receiving Stolen Property'
 'Liquor Law Violations' 'Arson' 'Disorderly Conduct' 'Embezzlement'
 'Prostitution and Commercialized Vice' 'Forgery and Counterfeiting'
 'Public Drunkenness' 'Vagrancy/Loitering' 'Gambling Violations'
 'Motor Vehicle Theft' 'Homicide - Criminal' 'Homicide - Justifiable'
 'Homicide - Gross Negligence']
# Reclassify the 32 raw offense labels into 5 broad buckets so later filtering
# and styling only has to handle a handful of cases.

crime_categories = {
    'Violent Crime': [
        'Robbery No Firearm', 'Robbery Firearm', 'Aggravated Assault Firearm', 
        'Aggravated Assault No Firearm', 'Homicide - Criminal', 'Homicide - Justifiable', 'Homicide - Gross Negligence'
    ],
    'Property Crime': [
        'Theft from Vehicle', 'Thefts', 'Burglary Residential', 'Burglary Non-Residential', 
        'Motor Vehicle Theft', 'Arson', 'Receiving Stolen Property', 'Vandalism/Criminal Mischief'
    ],
    'Drug/Alcohol Crime': [
        'Narcotic / Drug Law Violations', 'DRIVING UNDER THE INFLUENCE', 'Liquor Law Violations', 
        'Public Drunkenness'
    ],
    'Sexual Offenses': [
        'Rape', 'Other Sex Offenses (Not Commercialized)', 'Prostitution and Commercialized Vice'
    ],
    'Miscellaneous': [
        'Fraud', 'Offenses Against Family and Children', 'Weapon Violations', 'Disorderly Conduct', 
        'Embezzlement', 'Forgery and Counterfeiting', 'Vagrancy/Loitering', 'Gambling Violations', 'All Other Offenses'
    ]
}

# Invert once into an offense -> category lookup: O(1) per row instead of
# scanning every category list for each of ~480k incidents.
_category_by_offense = {
    offense: category
    for category, offenses in crime_categories.items()
    for offense in offenses
}

def map_crime_category(crime_type):
    """Return the broad bucket for a raw offense label ('Other' if unknown)."""
    return _category_by_offense.get(crime_type, 'Other')

crime_gdf['crime_category'] = crime_gdf['text_general_code'].apply(map_crime_category)

crime_monthly = crime_gdf.groupby(['crime_category', 'year_month']).size().reset_index(name='count')

# Keep only rows with a usable geometry: present, non-empty, and valid.
# This is the vectorized form GeoPandas itself recommends; the previous
# astype(object).apply(...) scan triggered a UserWarning and still kept
# empty (unplottable) geometries.
geom = crime_gdf.geometry
crime_gdf = crime_gdf[geom.notna() & ~geom.is_empty & geom.is_valid]

# Month-by-month point map, colored by broad crime category.
crime_map = crime_gdf.hvplot.points(
    'geometry.x', 'geometry.y',
    geo=True,
    tiles='OSM',
    groupby='year_month',
    color='crime_category',
    title='Crime Events by Month and Category',
    size=5,
    alpha=0.6,
    height=600,
    width=800
)
C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\geopandas\geoseries.py:751: UserWarning: GeoSeries.notna() previously returned False for both missing (None) and empty geometries. Now, it only returns False for missing values. Since the calling GeoSeries contains empty geometries, the result has changed compared to previous versions of GeoPandas.
Given a GeoSeries 's', you can use '~s.is_empty & s.notna()' to get back the old behaviour.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', 'GeoSeries.notna', UserWarning)
  return self.notna()
# Monthly trend lines, one per raw offense type.
crime_monthly = (
    crime_gdf
    .groupby(['text_general_code', 'year_month'])
    .size()
    .reset_index(name='count')
)

crime_plot = crime_monthly.hvplot.line(
    x='year_month', 
    y='count', 
    by='text_general_code',
    title='Monthly Crime Trends by Type', 
    xlabel='Month', 
    ylabel='Number of Crimes', 
    legend='top_left', 
    width=1000, 
    height=500
)

crime_plot
# Quick standalone preview of the city boundary over a light-gray basemap.
city_limits_layer = city_limits.hvplot.polygons(
    geo=True,
    tiles="CartoLight",  # light gray basemap
    line_color='black',
    line_width=2,
    fill_alpha=0.1,
    width=600,
    height=400
)

city_limits_layer
from bokeh.palettes import inferno

# Fixed category order drives both the color assignment here and the Altair
# scales used in the dashboard below.
crime_categories = ['Violent Crime', 'Property Crime', 'Drug/Alcohol Crime', 'Sexual Offenses', 'Miscellaneous']
# Generate the palette once instead of rebuilding it for every category.
_palette = inferno(len(crime_categories))
crime_colors = dict(zip(crime_categories, _palette))
#now lets visualize some data!
# City outline only (no tiles) so it can be multiplied onto the point layer.
city_limits_layer = city_limits.hvplot.polygons(
    geo=True,
    tiles=None,
    line_width=2,
    line_color='red',
    fill_alpha=0,
    height=700,
    width=900
)

# Create a Panel interactive dashboard
# The slider selects an index into the sorted list of 'YYYY-MM' buckets.
month_slider = pn.widgets.IntSlider(name='Month', start=0, end=len(crime_gdf['year_month'].unique()) - 1, step=1, value=0)
month_list = sorted(crime_gdf['year_month'].unique())

# NOTE(review): update_month's return value is never consumed elsewhere in
# this file — the map/chart callbacks index month_list themselves. Confirm
# whether this watcher is still needed.
@pn.depends(month_slider.param.value, watch=True)
def update_month(selected_index):
    return month_list[selected_index]

# Category filter; all five broad categories enabled by default.
type_selector = pn.widgets.CheckBoxGroup(
    name='Crime Types',
    value=list(crime_colors.keys()),
    options=list(crime_colors.keys())
)

@pn.depends(month_slider.param.value, type_selector.param.value)
def update_map(selected_index, selected_types):
    """Render crime points for the chosen month/categories over the city outline."""
    selected_month = month_list[selected_index]
    in_month = crime_gdf['year_month'] == selected_month
    in_types = crime_gdf['crime_category'].isin(selected_types)
    filtered_data = crime_gdf[in_month & in_types]
    crime_layer = filtered_data.hvplot.points(
        'geometry.x', 'geometry.y',
        geo=True,
        tiles="CartoLight",
        color='crime_category',
        cmap=crime_colors,
        size=5,
        alpha=0.6,
        height=700,
        width=900
    )
    # Overlay: boundary outline * filtered points.
    return city_limits_layer * crime_layer

@pn.depends(month_slider.param.value, type_selector.param.value)
def update_bar_chart(selected_index, selected_types):
    """Bar chart of incident counts per category for the selected month."""
    selected_month = month_list[selected_index]  # resolve month from slider index
    subset = crime_gdf[
        (crime_gdf['year_month'] == selected_month)
        & (crime_gdf['crime_category'].isin(selected_types))
    ]
    monthly_counts = subset.groupby('crime_category').size().reset_index(name='count')
    color_scale = alt.Scale(domain=list(crime_colors.keys()), range=list(crime_colors.values()))
    chart = (
        alt.Chart(monthly_counts)
        .mark_bar()
        .encode(
            x=alt.X('crime_category', sort='-y', title='Crime Category'),
            y=alt.Y('count', title='Count'),
            color=alt.Color('crime_category', scale=color_scale)
        )
        .properties(
            title=f"Crime Categories for {selected_month}",
            width=400,
            height=250
        )
    )
    return chart

@pn.depends(type_selector.param.value)
def update_line_chart(selected_types):
    """Monthly trend lines for the currently-selected crime categories."""
    subset = crime_gdf[crime_gdf['crime_category'].isin(selected_types)]
    monthly_counts = (
        subset.groupby(['year_month', 'crime_category'])
        .size()
        .reset_index(name='count')
    )
    color_scale = alt.Scale(domain=list(crime_colors.keys()), range=list(crime_colors.values()))
    return (
        alt.Chart(monthly_counts)
        .mark_line(point=True)
        .encode(
            x=alt.X('year_month:T', title='Month'),
            y=alt.Y('count:Q', title='Count'),
            color=alt.Color('crime_category:N', scale=color_scale)
        )
        .properties(
            title="Crime Trends by Month",
            width=400,
            height=250
        )
    )

# Stack the bar chart and the trend chart vertically, next to the map.
@pn.depends(month_slider.param.value, type_selector.param.value)
def combined_chart(selected_index, selected_types):
    """Bar + line charts for the current widget selection, stacked vertically."""
    panes = [
        pn.pane.Vega(update_bar_chart(selected_index, selected_types)),
        pn.pane.Vega(update_line_chart(selected_types)),
    ]
    return pn.Column(*panes, sizing_mode='stretch_width')

# Load Panel's Vega renderer before embedding Altair charts; without it
# pn.pane.Vega warns ("VegaPlot was not imported on instantiation") and the
# charts may not render in the notebook.
pn.extension('vega')

# Update layout to include combined charts in the map
crime_dashboard = pn.Column(
    "## Crime Dashboard",
    month_slider,
    type_selector,
    pn.Row(
        update_map,
        combined_chart,
        sizing_mode='stretch_width'
    )
)

crime_dashboard.servable()
WARNING:param.main: VegaPlot was not imported on instantiation and may not render in a notebook. Restart the notebook kernel and ensure you load it as part of the extension using:

pn.extension('vega')
####Social Data Part!####
import cenpy as cny

# A Census API key is optional for light use; kept here for reference.
#cny.set_sitekey("86e9f13d3e07b9344204c7b748ce3feef9772df5")
acs_dataset = "ACSDT5Y2023"  # ACS 5-Year dataset for 2023
conn = cny.products.APIConnection(acs_dataset)

# Inspect the available ACS variables before choosing which ones to pull.
variables_df = conn.variables
print(variables_df.head())
                                                          label  \
for                                Census API FIPS 'for' clause   
in                                  Census API FIPS 'in' clause   
ucgid                Uniform Census Geography Identifier clause   
B24022_060E   Estimate!!Total:!!Female:!!Service occupations...   
B19001B_014E             Estimate!!Total:!!$100,000 to $124,999   

                                                        concept predicateType  \
for                          Census API Geography Specification      fips-for   
in                           Census API Geography Specification       fips-in   
ucgid                        Census API Geography Specification         ucgid   
B24022_060E   Sex by Occupation and Median Earnings in the P...           int   
B19001B_014E  Household Income in the Past 12 Months (in 202...           int   

                group limit predicateOnly hasGeoCollectionSupport  \
for               N/A     0          True                     NaN   
in                N/A     0          True                     NaN   
ucgid             N/A     0          True                    True   
B24022_060E    B24022     0           NaN                     NaN   
B19001B_014E  B19001B     0           NaN                     NaN   

                                            attributes required  
for                                                NaN      NaN  
in                                                 NaN      NaN  
ucgid                                              NaN      NaN  
B24022_060E      B24022_060EA,B24022_060M,B24022_060MA      NaN  
B19001B_014E  B19001B_014EA,B19001B_014M,B19001B_014MA      NaN  
# Scrape some social (ACS) data here to visualize: code -> human-readable label.
acs_variables = {
    "B01001_001E": "Total Population",
    "B03002_006E": "Asian Alone",
    "B03002_004E": "Black or African American Alone",
    "B03003_003E": "Hispanic or Latino",
    "B03002_003E": "White Alone",
    "B01002_001E": "Median Age",
    "B11001_001E": "Total Households",
    "B11001_002E": "Total Families",
    "B11001_003E": "Married-couple Family",
    "B25001_001E": "Total Housing Units",
    "B25002_002E": "Occupied Housing Units",
    "B25002_003E": "Vacant Housing Units",
    "B25003_002E": "Owner-occupied Housing Units",
    "B25003_003E": "Renter-occupied Housing Units",
    "B19013_001E": "Median Household Income",
    "B17010_001E": "Poverty Status Universe",
    "B17010_002E": "Below Poverty Level"
}

acs = cny.products.ACS()
# Tract-level pull for Philadelphia County, with tract polygons attached.
data = acs.from_county(
    county="Philadelphia County, PA",
    variables=list(acs_variables.keys()),
    level="tract",
    return_geometry=True
)

# acs_variables already maps code -> label; no need to rebuild the mapping
# via dict(zip(keys, values)).
data = data.rename(columns=acs_variables)

# NOTE: ESRI shapefile truncates column names to 10 characters — the
# read-back code below works against the truncated names.
data.to_file("Philadelphia_ACS_2023.shp")
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\1812328781.py:32: UserWarning: Column names longer than 10 characters will be truncated when saved to ESRI Shapefile.
  data.to_file("Philadelphia_ACS_2023.shp")
# Read the ACS export back (columns carry the 10-char shapefile-truncated
# names) and normalize to WGS84 for mapping.
social_file_path = "Philadelphia_ACS_2023.shp"
acs_gdf = gpd.read_file(social_file_path)

needs_reproject = acs_gdf.crs.to_string() != "EPSG:4326"
if needs_reproject:
    acs_gdf = acs_gdf.to_crs(epsg=4326)

print("CRS:", acs_gdf.crs)
print("Columns:", acs_gdf.columns)
print(acs_gdf.head())

acs_gdf.explore()
CRS: EPSG:4326
Columns: Index(['GEOID', 'Total Popu', 'Median Age', 'White Alon', 'Black or A',
       'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami', 'Married-co',
       'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1', 'Occupied H',
       'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state', 'county',
       'tract', 'geometry'],
      dtype='object')
         GEOID  Total Popu  Median Age  White Alon  Black or A  Asian Alon  \
0  42101009802      6190.0        33.5       580.0      5341.0        35.0   
1  42101037500      3736.0        31.3      1672.0      1467.0       165.0   
2  42101021900      1434.0        44.7      1307.0        50.0        24.0   
3  42101011300      3344.0        28.5        62.0      3229.0         0.0   
4  42101021800      5141.0        30.9      2822.0      1497.0       247.0   

   Hispanic o  Total Hous  Total Fami  Married-co  ...  Total Ho_1  \
0       149.0      2129.0      1467.0       576.0  ...      2371.0   
1       298.0      1224.0       733.0       547.0  ...      1383.0   
2        41.0       654.0       307.0       238.0  ...       751.0   
3        53.0      1102.0       770.0       133.0  ...      1392.0   
4       178.0      2242.0      1073.0       887.0  ...      2394.0   

   Occupied H  Vacant Hou  Owner-occu  Renter-occ  \
0      2129.0       242.0      1593.0       536.0   
1      1224.0       159.0       722.0       502.0   
2       654.0        97.0       551.0       103.0   
3      1102.0       290.0       621.0       481.0   
4      2242.0       152.0       597.0      1645.0   

                                                NAME  state  county   tract  \
0  Census Tract 98.02, Philadelphia County, Penns...     42     101  009802   
1  Census Tract 375, Philadelphia County, Pennsyl...     42     101  037500   
2  Census Tract 219, Philadelphia County, Pennsyl...     42     101  021900   
3  Census Tract 113, Philadelphia County, Pennsyl...     42     101  011300   
4  Census Tract 218, Philadelphia County, Pennsyl...     42     101  021800   

                                            geometry  
0  POLYGON ((-75.27547 39.97743, -75.27528 39.977...  
1  POLYGON ((-75.26551 39.98186, -75.26475 39.982...  
2  POLYGON ((-75.25438 40.04657, -75.25384 40.046...  
3  POLYGON ((-75.23947 39.98111, -75.23942 39.981...  
4  POLYGON ((-75.23807 40.05988, -75.23622 40.061...  

[5 rows x 23 columns]
Make this Notebook Trusted to load map: File -> Trust Notebook
# Columns eligible for choropleth mapping: any numeric dtype. Comparing the
# dtype against the Python float/int types misses fixed-width variants
# (e.g. int32 on Windows); pandas' is_numeric_dtype covers them all.
visualizable_columns = [
    col for col in acs_gdf.columns if pd.api.types.is_numeric_dtype(acs_gdf[col])
]

print(visualizable_columns)
['Total Popu', 'Median Age', 'White Alon', 'Black or A', 'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami', 'Married-co', 'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1', 'Occupied H', 'Vacant Hou', 'Owner-occu', 'Renter-occ']
#Now lets see how social data works!

import folium
from folium import GeoJson, Tooltip
import panel as pn
import mapclassify as mc
import branca.colormap as cm

# Choropleth controls: classification scheme and number of classes.
classification_methods = ['Natural Breaks', 'Equal Interval']
num_classes_options = list(range(5, 11))

# Which ACS column to map.
data_selector = pn.widgets.Select(
    name="Select Data to Visualize", 
    options=visualizable_columns,
    value=visualizable_columns[0]
)
classification_selector = pn.widgets.Select(
    name="Select Classification Method",
    options=classification_methods,
    value=classification_methods[0]
)
# NOTE(review): the slider caps at 8 classes while num_classes_options above
# goes up to 10 — confirm which range is intended.
num_classes_selector = pn.widgets.IntSlider(
    name="Select Number of Classes",
    start=5, 
    end=8, 
    step=1, 
    value=5
)

# data update function
@pn.depends(
    data_selector.param.value,
    classification_selector.param.value,
    num_classes_selector.param.value
)
def update_map(selected_column, classification_method, num_classes):
    """Build a folium choropleth of one ACS column, classified per the widgets.

    Returns a folium.Map centered on Philadelphia with one GeoJson polygon per
    tract, filled by its class color, plus a linear colorbar legend.
    """
    m = folium.Map(
        location=[39.9526, -75.1652],
        zoom_start=12,
        tiles="CartoDB positron",
        control_scale=True,
        width=900,
        height=600
    )
    
    # Only the two methods offered by classification_selector are handled;
    # any other value would leave `classifier` unbound.
    if classification_method == 'Natural Breaks':
        classifier = mc.NaturalBreaks(acs_gdf[selected_column], k=num_classes)
    elif classification_method == 'Equal Interval':
        classifier = mc.EqualInterval(acs_gdf[selected_column], k=num_classes)
    
    # NOTE(review): this writes a 'class' column onto the module-level acs_gdf
    # on every widget change — a side effect shared across callbacks.
    acs_gdf['class'] = classifier.yb
    bins = classifier.bins
    
    # Continuous color ramp over the raw value range; used both for the legend
    # and (via bins) for the per-class fill colors below.
    colormap = cm.LinearColormap(
        colors=['#f7fcf0', '#c7e9c0', '#7fcdbb', '#41b6c4', '#2c7fb8', '#253494'],
        vmin=acs_gdf[selected_column].min(),
        vmax=acs_gdf[selected_column].max(),
        caption=f"{selected_column} ({classification_method})"
    )
    

    # Map each class index to the ramp color of that class's upper bin bound.
    fill_color_mapping = {i: colormap(v) for i, v in enumerate(bins)}

    for _, row in acs_gdf.iterrows():
        tooltip_text = (
            f"<b>{row['NAME']}</b><br>"
            f"{selected_column}: {row[selected_column]:,.2f}"
        )
        GeoJson(
            row['geometry'],
            # row=row default binds the current row (avoids the late-binding
            # closure pitfall inside the loop).
            style_function=lambda x, row=row: {
                "fillColor": fill_color_mapping[row['class']],
                "color": "transparent", 
                "weight": 0,
                "fillOpacity": 0.7,
            },
            tooltip=Tooltip(tooltip_text)
        ).add_to(m)
    
    colormap.add_to(m)
    return m

# Assemble the ACS choropleth dashboard: widgets on top, reactive map below.
dashboard = pn.Column(
    pn.pane.Markdown("## Philadelphia ACS Data Visualization"),
    data_selector,
    classification_selector,
    num_classes_selector,
    pn.panel(update_map, sizing_mode="stretch_width", height=600)
)

dashboard.servable()
C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
found 0 physical cores < 1
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
  warnings.warn(
  File "C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")
C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
####Go with the housing data web scraping
# OPA property assessments from the same Carto SQL endpoint.
Housing_API_URL = "https://phl.carto.com/api/v2/sql"

start_date = "2022-01-01"
end_date = "2024-12-23"

query = f"""
SELECT *
FROM opa_properties_public
WHERE assessment_date >= '{start_date}' AND assessment_date <= '{end_date}'
"""

# Let requests URL-encode the SQL (newlines, spaces, quotes) via params=
# instead of interpolating it into the URL by hand — same pattern as the
# crime-data request above.
response = requests.get(Housing_API_URL, params={"q": query}, timeout=120)
if response.status_code != 200:
    raise Exception(f"API request failed with status {response.status_code}: {response.text}")
#got so many columns that some we wont need later

Housing_data = response.json()
properties_df = pd.DataFrame(Housing_data['rows'])

print("Columns in the dataset:", properties_df.columns)
Columns in the dataset: Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'assessment_date',
       'basements', 'beginning_point', 'book_and_page', 'building_code',
       'building_code_description', 'category_code',
       'category_code_description', 'census_tract', 'central_air',
       'cross_reference', 'date_exterior_condition', 'depth',
       'exempt_building', 'exempt_land', 'exterior_condition', 'fireplaces',
       'frontage', 'fuel', 'garage_spaces', 'garage_type',
       'general_construction', 'geographic_ward', 'homestead_exemption',
       'house_extension', 'house_number', 'interior_condition', 'location',
       'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
       'mailing_city_state', 'mailing_street', 'mailing_zip', 'market_value',
       'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms',
       'number_of_rooms', 'number_stories', 'off_street_open',
       'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
       'quality_grade', 'recording_date', 'registry_number', 'sale_date',
       'sale_price', 'separate_utilities', 'sewer', 'site_type', 'state_code',
       'street_code', 'street_designation', 'street_direction', 'street_name',
       'suffix', 'taxable_building', 'taxable_land', 'topography',
       'total_area', 'total_livable_area', 'type_heater', 'unfinished', 'unit',
       'utility', 'view_type', 'year_built', 'year_built_estimate', 'zip_code',
       'zoning', 'pin', 'building_code_new', 'building_code_description_new',
       'objectid'],
      dtype='object')
properties_df.tail()
cartodb_id the_geom the_geom_webmercator assessment_date basements beginning_point book_and_page building_code building_code_description category_code ... utility view_type year_built year_built_estimate zip_code zoning pin building_code_new building_code_description_new objectid
583657 584047 0101000020E6100000CA62F43784C452C0553088271300... 0101000020110F0000DF8AED67F4E05FC1CB865FCEFA8F... 2024-06-06T16:07:08Z D 417'8 3/4" NE LEFEVRE 0000000 H36 SEMI/DET 2 STY FRAME 1 ... None I 1920 None 19137 RSA5 1001192329 32 TWIN CONVENTIONAL 563626477
583658 584048 0101000020E61000002876388229C352C0DAF8E3121703... 0101000020110F00002E29FC7BA7DE5FC15D1760C65293... 2024-06-06T16:05:14Z None 15' 1/2"SE DITMAN 0000000 O30 ROW 2 STY MASONRY 1 ... None I 1920 Y 19135 RSA5 1001342218 24 ROW PORCH FRONT 563626478
583659 584049 0101000020E61000009638723466CA52C07637567AF002... 0101000020110F0000C30DA979F2EA5FC11BC631F82793... 2024-11-16T12:29:40Z None SE COR APSLEY ST None LB0 IND. LGHT MFG MASONRY 14 ... None I 2023 None 19144 CMX3 1001681092 830 APARTMENTS - MID RISE 563626479
583660 584050 0101000020E61000009638723466CA52C07637567AF002... 0101000020110F0000C30DA979F2EA5FC11BC631F82793... 2024-12-14T16:53:16Z None SE COR APSLEY ST None LB0 IND. LGHT MFG MASONRY 10 ... None I 1920 None 19144 CMX3 1001681093 241 OFFICE BLDNG - Low Rise 563626480
583661 584051 0101000020E6100000C75E7D144DC552C04F40CC0FA801... 0101000020110F0000F891E19649E25FC1DADDADC3BB91... 2024-06-06T16:05:55Z None 128'9" N GILLINGHAM 0000000 SR VACANT LAND RESIDE < ACRE 6 ... None E None None 19124 ICMX 1001579732 None None 563626481

5 rows × 81 columns

from shapely import wkb

def safe_wkb_loads(wkb_str):
    """Decode a hex-encoded WKB string; return None for anything unparseable."""
    try:
        raw = bytes.fromhex(wkb_str)
        return wkb.loads(raw)
    except Exception:
        # Missing / malformed geometry — caller filters out the Nones.
        return None

# Parse every geometry, then keep only the rows that decoded successfully.
properties_df['parsed_geometry'] = properties_df['the_geom'].apply(safe_wkb_loads)

has_geometry = properties_df['parsed_geometry'].notnull()
properties_with_geometry = properties_df[has_geometry]

print(f"Number of properties after filtering: {len(properties_with_geometry)}")
print(properties_with_geometry.tail())
Number of properties after filtering: 583662
        cartodb_id                                           the_geom  \
583657      584047  0101000020E6100000CA62F43784C452C0553088271300...   
583658      584048  0101000020E61000002876388229C352C0DAF8E3121703...   
583659      584049  0101000020E61000009638723466CA52C07637567AF002...   
583660      584050  0101000020E61000009638723466CA52C07637567AF002...   
583661      584051  0101000020E6100000C75E7D144DC552C04F40CC0FA801...   

                                     the_geom_webmercator  \
583657  0101000020110F0000DF8AED67F4E05FC1CB865FCEFA8F...   
583658  0101000020110F00002E29FC7BA7DE5FC15D1760C65293...   
583659  0101000020110F0000C30DA979F2EA5FC11BC631F82793...   
583660  0101000020110F0000C30DA979F2EA5FC11BC631F82793...   
583661  0101000020110F0000F891E19649E25FC1DADDADC3BB91...   

             assessment_date basements        beginning_point book_and_page  \
583657  2024-06-06T16:07:08Z         D  417'8 3/4" NE LEFEVRE       0000000   
583658  2024-06-06T16:05:14Z      None      15' 1/2"SE DITMAN       0000000   
583659  2024-11-16T12:29:40Z      None       SE COR APSLEY ST          None   
583660  2024-12-14T16:53:16Z      None       SE COR APSLEY ST          None   
583661  2024-06-06T16:05:55Z      None    128'9" N GILLINGHAM       0000000   

       building_code  building_code_description category_code  ... view_type  \
583657         H36         SEMI/DET 2 STY FRAME            1   ...         I   
583658         O30            ROW 2 STY MASONRY            1   ...         I   
583659         LB0        IND. LGHT MFG MASONRY            14  ...         I   
583660         LB0        IND. LGHT MFG MASONRY            10  ...         I   
583661         SR     VACANT LAND RESIDE < ACRE            6   ...         E   

       year_built year_built_estimate zip_code zoning         pin  \
583657       1920                None    19137   RSA5  1001192329   
583658       1920                   Y    19135   RSA5  1001342218   
583659       2023                None    19144   CMX3  1001681092   
583660       1920                None    19144   CMX3  1001681093   
583661       None                None    19124   ICMX  1001579732   

        building_code_new  building_code_description_new   objectid  \
583657                 32              TWIN CONVENTIONAL  563626477   
583658                 24                ROW PORCH FRONT  563626478   
583659                830          APARTMENTS - MID RISE  563626479   
583660                241        OFFICE BLDNG - Low Rise  563626480   
583661               None                           None  563626481   

                                     parsed_geometry  
583657  POINT (-75.07056998124895 40.00058454656452)  
583658  POINT (-75.04940848840545 40.02414165622186)  
583659  POINT (-75.16248809008025 40.02296380243108)  
583660  POINT (-75.16248809008025 40.02296380243108)  
583661  POINT (-75.08282959216295 40.01294133637622)  

[5 rows x 82 columns]
# Columns we never use downstream (owner/mailing details, street parsing,
# tax components, etc.) — drop them to slim the frame.
columns_to_drop_housing = [
    'beginning_point', 'book_and_page', 'building_code',
    'building_code_description', 'category_code',
    'category_code_description', 'cross_reference', 'date_exterior_condition', 'depth',
    'exempt_building', 'exempt_land', 'exterior_condition',
    'general_construction', 'geographic_ward', 'homestead_exemption',
    'house_extension', 'location',
    'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
    'mailing_city_state', 'mailing_street', 'mailing_zip',
    'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape', 
    'registry_number', 'separate_utilities', 'sewer', 'site_type', 'state_code',
    'street_code', 'street_designation', 'street_direction', 'street_name',
    'suffix', 'taxable_building', 'taxable_land', 'topography', 'unfinished', 'unit',
    'utility', 'view_type', 'zip_code', 
    'zoning', 'pin', 'building_code_new', 'building_code_description_new'
]

# Only drop columns that actually exist, so an upstream schema change
# doesn't raise KeyError.
columns_present_housing = [col for col in columns_to_drop_housing if col in properties_df.columns]
# columns= already implies the column axis; the extra axis=1 was redundant.
cleaned_properties_df = properties_df.drop(columns=columns_present_housing)

print("Remaining columns in the cleaned dataset:")
print(cleaned_properties_df.columns)
Remaining columns in the cleaned dataset:
Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'assessment_date',
       'basements', 'census_tract', 'central_air', 'fireplaces', 'frontage',
       'fuel', 'garage_spaces', 'garage_type', 'house_number',
       'interior_condition', 'market_value', 'market_value_date',
       'number_of_bathrooms', 'number_of_bedrooms', 'number_of_rooms',
       'number_stories', 'off_street_open', 'quality_grade', 'recording_date',
       'sale_date', 'sale_price', 'total_area', 'total_livable_area',
       'type_heater', 'year_built', 'year_built_estimate', 'objectid',
       'parsed_geometry'],
      dtype='object')
# Promote the cleaned frame to a GeoDataFrame keyed on the parsed point
# geometry (Carto returns lon/lat, i.e. EPSG:4326).
# NOTE(review): this wraps cleaned_properties_df, not properties_with_geometry,
# so rows whose geometry failed to parse (None) would still be included —
# the printed counts above suggest none exist, but confirm.
cleaned_properties_gdf = gpd.GeoDataFrame(
    cleaned_properties_df,
    geometry='parsed_geometry',
    crs="EPSG:4326"
)

print(f"Number of properties after filtering and column dropping: {len(cleaned_properties_gdf)}")
cleaned_properties_gdf.head()
Number of properties after filtering and column dropping: 583662
cartodb_id the_geom the_geom_webmercator assessment_date basements census_tract central_air fireplaces frontage fuel ... recording_date sale_date sale_price total_area total_livable_area type_heater year_built year_built_estimate objectid parsed_geometry
0 1 0101000020E6100000DBD817B5A3CB52C0B69AFD4035FF... 0101000020110F000090B604C90DED5FC1CB2D85CC048F... 2024-06-06T16:10:03Z None 169 None NaN 14.0 None ... 2071-12-27T05:00:00Z 1971-12-27T05:00:00Z 1.0 639.0 NaN None None None 563042662 POINT (-75.18187 39.99381)
1 2 0101000020E61000006BFC9A49A4CB52C058DAA6D48BFE... 0101000020110F0000E00B48C50EED5FC1019687FC488E... 2024-06-06T16:08:52Z None 151 None NaN 14.0 None ... 2071-12-27T05:00:00Z 2071-12-27T05:00:00Z 1.0 674.0 NaN None None None 563042663 POINT (-75.18190 39.98864)
2 3 0101000020E6100000111C0D6E98CF52C0573761511EF6... 0101000020110F00000FE20CFFC5F35FC1FBFAC589F284... 2024-06-06T16:12:21Z None 64 None NaN 72.0 None ... 2071-12-17T05:00:00Z 2071-12-17T05:00:00Z 1.0 6636.0 2400.0 None 1925 None 563042664 POINT (-75.24368 39.92280)
3 4 0101000020E61000009151A5AE7ACB52C0B624691A9BFE... 0101000020110F00007D008E19C8EC5FC11B9681EA598E... 2024-06-06T16:09:40Z None 151 None NaN 16.0 None ... 2071-12-16T05:00:00Z 2071-12-16T05:00:00Z 1.0 992.0 NaN None None None 563042665 POINT (-75.17936 39.98911)
4 5 0101000020E61000005A33DE5D9ECA52C0F73C1F77F5FD... 0101000020110F0000EB7628DF51EB5FC120B8F14FA28D... 2024-06-06T16:08:58Z None 152 None NaN 15.0 None ... 2071-12-14T05:00:00Z 2071-12-14T05:00:00Z 1.0 1122.0 NaN None None None 563042666 POINT (-75.16592 39.98405)

5 rows × 32 columns

# The cleaning above leaves ~583k property records; render their spatial
# distribution with datashader, bounded by the city limits.
import datashader as ds
import datashader.transfer_functions as tf
from datashader.utils import lnglat_to_meters
import geopandas as gpd

from datashader.colors import Greys9, viridis, inferno
from colorcet import fire
# Show how the property points spread out within the city boundary.
philadelphia_range_path = "final_data/city_limits_3857.geojson"
city_limits = gpd.read_file(philadelphia_range_path)

# Normalize to EPSG:4326 first...
if city_limits.crs.to_string() != "EPSG:4326":
    city_limits = city_limits.to_crs(epsg=4326)

# ...then project both layers to Web Mercator (EPSG:3857) so the datashader
# canvas works in planar coordinates.
cleaned_properties_gdf = cleaned_properties_gdf.to_crs(epsg=3857)
city_limits = city_limits.to_crs(epsg=3857)

city_bounds = city_limits.total_bounds  # (xmin, ymin, xmax, ymax)
x_range, y_range = (city_bounds[0], city_bounds[2]), (city_bounds[1], city_bounds[3])

# Datashader aggregates plain x/y columns, not shapely geometries.
cleaned_properties_gdf["x"] = cleaned_properties_gdf.geometry.x
cleaned_properties_gdf["y"] = cleaned_properties_gdf.geometry.y

canvas = ds.Canvas(
    plot_width=800,
    plot_height=600,
    x_range=(city_limits.total_bounds[0], city_limits.total_bounds[2]),  # x range (min, max)
    y_range=(city_limits.total_bounds[1], city_limits.total_bounds[3]),  # y range (min, max)
)

# Count points per pixel, then log-shade so dense areas don't saturate the image.
agg = canvas.points(cleaned_properties_gdf, "x", "y", agg=ds.count())

image = tf.shade(agg, cmap=["lightblue", "blue", "darkblue"], how="log")

fig, ax = plt.subplots(figsize=(10, 8))

# Place the raster in map coordinates so the boundary overlay aligns with it.
ax.imshow(
    image.to_pil(),
    extent=(
        city_limits.total_bounds[0],  # xmin
        city_limits.total_bounds[2],  # xmax
        city_limits.total_bounds[1],  # ymin
        city_limits.total_bounds[3],  # ymax
    ),
    origin="upper",
)

city_limits.plot(ax=ax, edgecolor="red", linewidth=1, facecolor="none")

ax.set_title("Philadelphia Housing Properties Distribution (Sampled)", fontsize=16)

plt.show()

# Explore the housing data by month and by census tract.
import geopandas as gpd
from shapely.geometry import Point
import os

# NOTE(review): machine-specific absolute path — breaks on any other machine;
# prefer a path relative to the project root.
os.chdir(r"C:\Users\44792\Desktop\essay help master\5500 MUSA Python\24fall-python-final-yu_qiushi_final_project")
print(f"Changed working directory to: {os.getcwd()}")

relative_path = "final/Philadelphia_ACS_2023.shp"

try:
    census_gdf = gpd.read_file(relative_path)
except FileNotFoundError:
    raise FileNotFoundError(f"The file at {relative_path} does not exist. Please check the path.")

# Project the tracts to Web Mercator to match the property layer.
if census_gdf.crs != "EPSG:3857":
    census_gdf = census_gdf.to_crs(epsg=3857)

print(f"Census Tract data loaded with {len(census_gdf)} rows.")
Changed working directory to: C:\Users\44792\Desktop\essay help master\5500 MUSA Python\24fall-python-final-yu_qiushi_final_project
Census Tract data loaded with 384 rows.
# Quick look at the tract attributes just loaded.
census_gdf.tail()
GEOID Total Popu Median Age White Alon Black or A Asian Alon Hispanic o Total Hous Total Fami Married-co ... Total Ho_1 Occupied H Vacant Hou Owner-occu Renter-occ NAME state county tract geometry
379 42101035602 3661.0 42.1 2518.0 73.0 707.0 307.0 1252.0 999.0 834.0 ... 1284.0 1252.0 32.0 1022.0 230.0 Census Tract 356.02, Philadelphia County, Penn... 42 101 035602 POLYGON ((-8355008.800 4881749.700, -8354858.9...
380 42101035601 5272.0 55.4 4069.0 140.0 887.0 127.0 2155.0 1344.0 964.0 ... 2404.0 2155.0 249.0 1398.0 757.0 Census Tract 356.01, Philadelphia County, Penn... 42 101 035601 POLYGON ((-8354256.940 4879308.850, -8354227.1...
381 42101033102 4048.0 34.3 3107.0 299.0 31.0 438.0 1556.0 970.0 651.0 ... 1760.0 1556.0 204.0 935.0 621.0 Census Tract 331.02, Philadelphia County, Penn... 42 101 033102 POLYGON ((-8352727.080 4872434.270, -8352711.8...
382 42101034801 4600.0 44.6 3812.0 220.0 201.0 342.0 1969.0 1215.0 761.0 ... 2006.0 1969.0 37.0 922.0 1047.0 Census Tract 348.01, Philadelphia County, Penn... 42 101 034801 POLYGON ((-8351724.540 4874706.750, -8351706.9...
383 42101036201 5153.0 38.5 4195.0 308.0 0.0 419.0 1981.0 1257.0 839.0 ... 2153.0 1981.0 172.0 1327.0 654.0 Census Tract 362.01, Philadelphia County, Penn... 42 101 036201 POLYGON ((-8348332.630 4877819.050, -8348033.1...

5 rows × 23 columns

# Join property points to census tracts, then aggregate property count and
# mean sale price per tract per month (assessment month is the time key).
properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, census_gdf, how="inner", predicate="intersects"
)

# to_period("M") drops timezone info (pandas warns); only year-month is needed here.
properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")

aggregated_data = properties_with_tract.groupby(["GEOID", "year_month"]).agg(
    property_count=("cartodb_id", "count"),
    avg_sale_price=("sale_price", "mean")
).reset_index()

# Re-attach the tract polygons so the aggregate can be mapped.
aggregated_data = aggregated_data.merge(
    census_gdf[["GEOID", "geometry"]], on="GEOID", how="left"
)
aggregated_gdf = gpd.GeoDataFrame(aggregated_data, geometry="geometry", crs="EPSG:3857")
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\1386209769.py:6: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
  properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")
# Not every tract has a property record in every month — gaps are expected.
aggregated_gdf.tail()
GEOID year_month property_count avg_sale_price geometry
2386 42101980900 2024-06 338 2.272857e+06 POLYGON ((-8378024.880 4848026.880, -8377966.4...
2387 42101980900 2024-09 7 9.213524e+06 POLYGON ((-8378024.880 4848026.880, -8377966.4...
2388 42101980900 2024-11 1 5.655000e+05 POLYGON ((-8378024.880 4848026.880, -8377966.4...
2389 42101989100 2023-05 4 1.250000e+00 POLYGON ((-8351467.390 4870423.710, -8351384.2...
2390 42101989100 2024-06 45 7.283044e+05 POLYGON ((-8351467.390 4870423.710, -8351384.2...
# Choropleth of property counts for one example month.
example_month = "2024-06"
plot_data = aggregated_gdf[aggregated_gdf["year_month"] == example_month]

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
census_gdf.plot(ax=ax, color="lightgrey", edgecolor="black")  # grey base shows tracts with no data that month
plot_data.plot(ax=ax, column="property_count", cmap="OrRd", legend=True)
ax.set_title(f"Housing Data Aggregation - {example_month}")
plt.show()

# Investigate house prices and their characteristics by region.
# NOTE(review): this reloads the same shapefile already loaded above — redundant
# if the notebook runs top-to-bottom, but it re-establishes census_gdf when
# earlier cells were skipped.
import geopandas as gpd
from folium import GeoJson

relative_path = "final/Philadelphia_ACS_2023.shp"

try:
    census_gdf = gpd.read_file(relative_path)
except FileNotFoundError:
    raise FileNotFoundError(f"The file at {relative_path} does not exist. Please check the path.")

if census_gdf.crs != "EPSG:3857":
    census_gdf = census_gdf.to_crs(epsg=3857)

print(f"Census Tract data loaded with {len(census_gdf)} rows.")
Census Tract data loaded with 384 rows.
# Repeat the tract join/aggregation, this time restricting to GEOIDs present in
# BOTH the aggregate and the census layer before merging geometry back on.
properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, census_gdf, how="inner", predicate="intersects"
)

# Year-month key from the assessment date; to_period drops timezone (warning is expected).
properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")

aggregated_data = properties_with_tract.groupby(["GEOID", "year_month"]).agg(
    property_count=("cartodb_id", "count"),
    avg_sale_price=("sale_price", "mean")
).reset_index()

# Normalize the key dtype on both sides before the set intersection / merge.
aggregated_data['GEOID'] = aggregated_data['GEOID'].astype(str)
census_gdf['GEOID'] = census_gdf['GEOID'].astype(str)

common_geoids = set(aggregated_data['GEOID']).intersection(set(census_gdf['GEOID']))

filtered_census_gdf = census_gdf[census_gdf['GEOID'].isin(common_geoids)]
filtered_aggregated_gdf = aggregated_data[aggregated_data['GEOID'].isin(common_geoids)]

final_aggregated_gdf = filtered_aggregated_gdf.merge(
    filtered_census_gdf[['GEOID', 'geometry']], 
    on="GEOID", 
    how="left"
)

final_aggregated_gdf = gpd.GeoDataFrame(final_aggregated_gdf, geometry="geometry", crs="EPSG:3857")

print(f"Final Aggregated Data Count: {len(final_aggregated_gdf)}")
Final Aggregated Data Count: 2391
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\1647549544.py:5: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
  properties_with_tract["year_month"] = pd.to_datetime(properties_with_tract["assessment_date"]).dt.to_period("M")
# Inspect the tail of the filtered tract-month aggregate.
final_aggregated_gdf.tail()
GEOID year_month property_count avg_sale_price geometry
2386 42101980900 2024-06 338 2.272857e+06 POLYGON ((-8378024.880 4848026.880, -8377966.4...
2387 42101980900 2024-09 7 9.213524e+06 POLYGON ((-8378024.880 4848026.880, -8377966.4...
2388 42101980900 2024-11 1 5.655000e+05 POLYGON ((-8378024.880 4848026.880, -8377966.4...
2389 42101989100 2023-05 4 1.250000e+00 POLYGON ((-8351467.390 4870423.710, -8351384.2...
2390 42101989100 2024-06 45 7.283044e+05 POLYGON ((-8351467.390 4870423.710, -8351384.2...
# Join social (census) and housing data. The tract polygons are buffered by 10
# units (meters in EPSG:3857) so property points sitting exactly on a tract
# boundary are not dropped by the join.
# FIX(review): an earlier unbuffered sjoin here was dead code — its result was
# immediately overwritten by the buffered join below — so it has been removed
# (it repeated an expensive spatial join over ~583k points for nothing).
buffered_census_gdf = census_gdf.copy()
buffered_census_gdf["geometry"] = buffered_census_gdf.geometry.buffer(10)

properties_with_tract = gpd.sjoin(
    cleaned_properties_gdf, buffered_census_gdf, how="inner", predicate="intersects"
)

# Re-anchor the result on the original point geometry column so downstream
# cells have a proper GeoDataFrame with the property points as active geometry.
properties_with_tract = gpd.GeoDataFrame(
    properties_with_tract,
    geometry='parsed_geometry',
    crs=cleaned_properties_gdf.crs
)

print(len(properties_with_tract))

# Coerce to datetime (unparseable values -> NaT), then keep only year-month;
# to_period drops timezone info, hence the pandas warning in the output.
properties_with_tract["assessment_date"] = pd.to_datetime(properties_with_tract["assessment_date"], errors="coerce")
properties_with_tract["year_month"] = properties_with_tract["assessment_date"].dt.to_period("M")
583942
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\2193257280.py:22: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.
  properties_with_tract["year_month"] = properties_with_tract["assessment_date"].dt.to_period("M")
# Guard against records whose assessment_date failed to parse (NaT periods).
if properties_with_tract["year_month"].isnull().sum() > 0:
    properties_with_tract["year_month"] = properties_with_tract["year_month"].fillna("Unknown")

properties_with_tract.tail()

# Housing and census attributes are now together in one table.
cartodb_id the_geom the_geom_webmercator assessment_date basements census_tract central_air fireplaces frontage fuel ... Total Ho_1 Occupied H Vacant Hou Owner-occu Renter-occ NAME state county tract year_month
583657 584047 0101000020E6100000CA62F43784C452C0553088271300... 0101000020110F0000DF8AED67F4E05FC1CB865FCEFA8F... 2024-06-06 16:07:08+00:00 D 183 N 0.0 20.0 None ... 1753.0 1539.0 214.0 1344.0 195.0 Census Tract 183, Philadelphia County, Pennsyl... 42 101 018300 2024-06
583658 584048 0101000020E61000002876388229C352C0DAF8E3121703... 0101000020110F00002E29FC7BA7DE5FC15D1760C65293... 2024-06-06 16:05:14+00:00 None 323 N 0.0 72.0 None ... 1611.0 1419.0 192.0 787.0 632.0 Census Tract 323, Philadelphia County, Pennsyl... 42 101 032300 2024-06
583659 584049 0101000020E61000009638723466CA52C07637567AF002... 0101000020110F0000C30DA979F2EA5FC11BC631F82793... 2024-11-16 12:29:40+00:00 None 244 None NaN NaN None ... 1301.0 1011.0 290.0 460.0 551.0 Census Tract 244, Philadelphia County, Pennsyl... 42 101 024400 2024-11
583660 584050 0101000020E61000009638723466CA52C07637567AF002... 0101000020110F0000C30DA979F2EA5FC11BC631F82793... 2024-12-14 16:53:16+00:00 None 244 None NaN NaN None ... 1301.0 1011.0 290.0 460.0 551.0 Census Tract 244, Philadelphia County, Pennsyl... 42 101 024400 2024-12
583661 584051 0101000020E6100000C75E7D144DC552C04F40CC0FA801... 0101000020110F0000F891E19649E25FC1DADDADC3BB91... 2024-06-06 16:05:55+00:00 None 294 None NaN 30.0 None ... 1337.0 1127.0 210.0 496.0 631.0 Census Tract 294, Philadelphia County, Pennsyl... 42 101 029400 2024-06

5 rows × 58 columns

# Blanket-fill every remaining NaN with 0.
# NOTE(review): this also touches the geometry column (hence the GeometryArray
# fillna deprecation warning in the output) and turns "unknown" into a literal
# 0 for every numeric column — revisit if 0 is a meaningful value for any field.
properties_with_tract = properties_with_tract.fillna(0)
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\2867799837.py:1: DeprecationWarning: ExtensionArray.fillna added a 'copy' keyword in pandas 2.1.0. In a future version, ExtensionArray subclasses will need to implement this keyword or an exception will be raised. In the interim, the keyword is ignored by GeometryArray.
  properties_with_tract = properties_with_tract.fillna(0)
from shapely.wkb import loads as wkb_loads 
# Ensure an active 'geometry' column exists, decoding the Carto WKB columns if
# necessary: 'the_geom' carries lon/lat (EPSG:4326), 'the_geom_webmercator'
# carries projected coordinates (EPSG:3857).
if "geometry" not in properties_with_tract.columns:
    if "the_geom" in properties_with_tract.columns:
        print("Converting 'the_geom' to 'geometry'...")
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom"].apply(wkb_loads),
            crs="EPSG:4326" 
        )
    elif "the_geom_webmercator" in properties_with_tract.columns:
        print("Converting 'the_geom_webmercator' to 'geometry'...")
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom_webmercator"].apply(wkb_loads),
            crs="EPSG:3857" 
        )
    else:
        raise ValueError("No valid geometry column ('geometry', 'the_geom', or 'the_geom_webmercator') found!")
Converting 'the_geom' to 'geometry'...
# Sanity check: the tract-month aggregate built earlier, still in Web Mercator.
print(final_aggregated_gdf.head())
print(final_aggregated_gdf.crs)
         GEOID year_month  property_count  avg_sale_price  \
0  42101000100    2023-05              59    3.703884e+06   
1  42101000100    2023-08               7    4.441428e+06   
2  42101000100    2023-09               2    5.725000e+06   
3  42101000100    2024-02               3    2.355000e+06   
4  42101000100    2024-03               1    2.025000e+06   

                                            geometry  
0  POLYGON ((-8365905.970 4858674.430, -8365885.3...  
1  POLYGON ((-8365905.970 4858674.430, -8365885.3...  
2  POLYGON ((-8365905.970 4858674.430, -8365885.3...  
3  POLYGON ((-8365905.970 4858674.430, -8365885.3...  
4  POLYGON ((-8365905.970 4858674.430, -8365885.3...  
EPSG:3857
## Exploratory cell: aggregate across ALL months per tract (time dimension dropped).
print("Grouping housing data by GEOID...")
aggregated_df = properties_with_tract.groupby("GEOID", as_index=False).agg({
    "sale_price": "mean",
    "cartodb_id": "count",
})

aggregated_df.rename(
    columns={
        "sale_price": "avg_sale_price",
        "cartodb_id": "property_count"
    },
    inplace=True
)

# merge back on GEOID; cast both sides to str so the key dtypes line up
census_gdf["GEOID"] = census_gdf["GEOID"].astype(str)
aggregated_df["GEOID"] = aggregated_df["GEOID"].astype(str)

final_aggregated_data = census_gdf.merge(
    aggregated_df,
    on="GEOID",
    how="left"  # all tracts kept, even those without property records
)

# all tract-level data in one frame now
print("Final aggregated data shape:", final_aggregated_data.shape)

print("Final GeoDataFrame created successfully!")
print(final_aggregated_data.head())
print("Geometry type:", final_aggregated_data.geometry.geom_type.value_counts())
Grouping housing data by GEOID...
Final aggregated data shape: (384, 25)
Final GeoDataFrame created successfully!
         GEOID  Total Popu  Median Age  White Alon  Black or A  Asian Alon  \
0  42101009802      6190.0        33.5       580.0      5341.0        35.0   
1  42101037500      3736.0        31.3      1672.0      1467.0       165.0   
2  42101021900      1434.0        44.7      1307.0        50.0        24.0   
3  42101011300      3344.0        28.5        62.0      3229.0         0.0   
4  42101021800      5141.0        30.9      2822.0      1497.0       247.0   

   Hispanic o  Total Hous  Total Fami  Married-co  ...  Vacant Hou  \
0       149.0      2129.0      1467.0       576.0  ...       242.0   
1       298.0      1224.0       733.0       547.0  ...       159.0   
2        41.0       654.0       307.0       238.0  ...        97.0   
3        53.0      1102.0       770.0       133.0  ...       290.0   
4       178.0      2242.0      1073.0       887.0  ...       152.0   

   Owner-occu  Renter-occ                                               NAME  \
0      1593.0       536.0  Census Tract 98.02, Philadelphia County, Penns...   
1       722.0       502.0  Census Tract 375, Philadelphia County, Pennsyl...   
2       551.0       103.0  Census Tract 219, Philadelphia County, Pennsyl...   
3       621.0       481.0  Census Tract 113, Philadelphia County, Pennsyl...   
4       597.0      1645.0  Census Tract 218, Philadelphia County, Pennsyl...   

   state  county   tract                                           geometry  \
0     42     101  009802  POLYGON ((-8379626.770 4862662.870, -8379606.2...   
1     42     101  037500  POLYGON ((-8378518.360 4863306.140, -8378433.4...   
2     42     101  021900  POLYGON ((-8377278.820 4872712.160, -8377218.9...   
3     42     101  011300  POLYGON ((-8375619.820 4863198.200, -8375613.5...   
4     42     101  021800  POLYGON ((-8375464.090 4874647.550, -8375257.5...   

  avg_sale_price property_count  
0  154038.260785           2063  
1  407074.872364            901  
2  171537.744552            826  
3   56054.240363           1323  
4  195453.591292            712  

[5 rows x 25 columns]
Geometry type: Polygon    384
Name: count, dtype: int64

print("Grouping geometry data by GEOID...")
# First property point per tract — a representative point, not the tract polygon.
geometry_data = properties_with_tract.groupby("GEOID").geometry.first()


# NOTE(review): geometry_data is indexed by GEOID while final_aggregated_data
# has a positional index, so this assignment aligns by index label and can
# misplace geometries; the reindex in the following cell repairs the alignment.
final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=geometry_data,
    crs=properties_with_tract.crs
)

print("Final GeoDataFrame created successfully!")

print("Sample geometries:")
print(properties_with_tract.geometry.head())
print("Geometry type:")
print(properties_with_tract.geometry.geom_type.value_counts())
Grouping geometry data by GEOID...
Final GeoDataFrame created successfully!
Sample geometries:
0    POINT (-75.18187 39.99381)
1    POINT (-75.18190 39.98864)
2    POINT (-75.24368 39.92280)
3    POINT (-75.17936 39.98911)
4    POINT (-75.16592 39.98405)
Name: geometry, dtype: geometry
Geometry type:
Point    583942
Name: count, dtype: int64
# Re-align the per-tract representative points to final_aggregated_data's row
# order (reindex by GEOID, then drop to a positional index), and rebuild the
# GeoDataFrame with the correctly aligned geometry.
geometry_data = geometry_data.reindex(final_aggregated_data["GEOID"]).reset_index(drop=True)

final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=geometry_data,
    crs=properties_with_tract.crs
)
# Merge monthly dynamics onto fixed tract attributes: static ACS data stays
# constant while housing metrics change month to month.
from shapely.wkb import loads as wkb_loads

# Re-decode a geometry column if the active one was lost along the way.
# NOTE(review): unlike the earlier decode cell, 'the_geom' is tagged EPSG:3857
# here although the coordinates printed below are clearly degrees (lon/lat) —
# this CRS tag looks wrong; confirm before doing any distance math.
if "geometry" not in properties_with_tract.columns:
    if "the_geom" in properties_with_tract.columns:
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom"].apply(wkb_loads),
            crs="EPSG:3857"
        )
    elif "the_geom_webmercator" in properties_with_tract.columns:
        properties_with_tract = gpd.GeoDataFrame(
            properties_with_tract,
            geometry=properties_with_tract["the_geom_webmercator"].apply(wkb_loads),
            crs="EPSG:3857"
        )
    else:
        raise ValueError("No valid geometry column found!")

# Representative point per tract, then align to final_aggregated_data's rows.
geometry_data = properties_with_tract.groupby("GEOID").geometry.first()
print("Geometry data after grouping:")
print(geometry_data.head())
print("Null geometries count:", geometry_data.isnull().sum())

geometry_data = geometry_data.reindex(final_aggregated_data["GEOID"]).reset_index(drop=True)

final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=geometry_data,
    crs=properties_with_tract.crs
)

print("Final aggregated data geometries:")
print(final_aggregated_data.geometry.head())
print("Geometry types:")
print(final_aggregated_data.geometry.geom_type.value_counts())
Geometry data after grouping:
GEOID
42101000100    POINT (-75.14808 39.95194)
42101000200    POINT (-75.16098 39.95747)
42101000300    POINT (-75.16542 39.95504)
42101000401    POINT (-75.17577 39.95263)
42101000402    POINT (-75.16494 39.95148)
Name: geometry, dtype: geometry
Null geometries count: 0
Final aggregated data geometries:
0    POINT (-75.26348 39.97478)
1    POINT (-75.25241 39.98796)
2    POINT (-75.24063 40.04991)
3    POINT (-75.23341 39.97987)
4    POINT (-75.23413 40.05904)
Name: geometry, dtype: geometry
Geometry types:
Point    384
Name: count, dtype: int64
# Each record contributes 1, so a per-tract sum gives the property count.
properties_with_tract["property_count"] = 1

aggregated_data = (
    properties_with_tract
    .groupby("GEOID", as_index=False) # month dimension dropped here: aggregate over the whole period (earlier cells did it by month)
    .agg(
        total_property_count=("property_count", "sum"),
        avg_rooms=("number_of_rooms", "mean"),
        avg_bathrooms=("number_of_bathrooms", "mean"),
        avg_bedrooms=("number_of_bedrooms", "mean"),
        avg_sale_price=("sale_price", "mean"),
        avg_market_value=("market_value", "mean")
    )
)
# Express prices in millions (rounded to 2 dp) so map legends stay readable.
aggregated_data["avg_sale_price"] = (aggregated_data["avg_sale_price"] / 1_000_000).round(2)
aggregated_data["avg_market_value"] = (aggregated_data["avg_market_value"] / 1_000_000).round(2)

# ACS columns that do not vary by month; taken once per tract from the joined table.
static_columns = [
    "GEOID", "Total Popu", "Median Age", "White Alon", "Black or A", 
    "Asian Alon", "Hispanic o", "Total Hous", "Total Fami", 
    "Married-co", "Poverty St", "Below Pove", "Median Hou", 
    "Occupied H", "Vacant Hou", "Owner-occu", "Renter-occ"
]

static_data = (
    properties_with_tract[static_columns]
    .drop_duplicates(subset="GEOID")
)

final_aggregated_data = aggregated_data.merge(
    static_data, on="GEOID", how="left"
)

census_gdf["GEOID"] = census_gdf["GEOID"].astype(str)
final_aggregated_data["GEOID"] = final_aggregated_data["GEOID"].astype(str)

final_aggregated_data = census_gdf.merge(
    final_aggregated_data,
    on="GEOID",
    how="left"
)

final_aggregated_data = gpd.GeoDataFrame(
    final_aggregated_data,
    geometry=final_aggregated_data.geometry,
    crs=census_gdf.crs
)

print("Final aggregated data geometry type:")
print(final_aggregated_data.geometry.geom_type.value_counts())
print(final_aggregated_data.columns)
# The *_x/*_y duplicates come from merging census_gdf against a table that
# already carries the same ACS columns (picked up via static_data from the
# spatial join). The *_x copies are dropped in the next cell instead of
# refactoring the pipeline.
Final aggregated data geometry type:
Polygon    384
Name: count, dtype: int64
Index(['GEOID', 'Total Popu_x', 'Median Age_x', 'White Alon_x', 'Black or A_x',
       'Asian Alon_x', 'Hispanic o_x', 'Total Hous_x', 'Total Fami_x',
       'Married-co_x', 'Poverty St_x', 'Below Pove_x', 'Median Hou_x',
       'Total Ho_1', 'Occupied H_x', 'Vacant Hou_x', 'Owner-occu_x',
       'Renter-occ_x', 'NAME', 'state', 'county', 'tract', 'geometry',
       'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
       'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
       'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
       'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
       'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
       'Owner-occu_y', 'Renter-occ_y'],
      dtype='object')
# The merge above duplicated the ACS columns as *_x (from census_gdf) and *_y
# (carried through the spatial join); keep the *_y copies, drop the *_x ones.
cols_to_drop = [col for col in final_aggregated_data.columns if col.endswith('_x')]
final_aggregated_data.drop(columns=cols_to_drop, inplace=True)
print(final_aggregated_data.columns)
Index(['GEOID', 'Total Ho_1', 'NAME', 'state', 'county', 'tract', 'geometry',
       'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
       'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
       'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
       'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
       'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
       'Owner-occu_y', 'Renter-occ_y'],
      dtype='object')
# Verify the cleaned tract-level table.
print(final_aggregated_data.tail())
           GEOID  Total Ho_1  \
379  42101035602      1284.0   
380  42101035601      2404.0   
381  42101033102      1760.0   
382  42101034801      2006.0   
383  42101036201      2153.0   

                                                  NAME state county   tract  \
379  Census Tract 356.02, Philadelphia County, Penn...    42    101  035602   
380  Census Tract 356.01, Philadelphia County, Penn...    42    101  035601   
381  Census Tract 331.02, Philadelphia County, Penn...    42    101  033102   
382  Census Tract 348.01, Philadelphia County, Penn...    42    101  034801   
383  Census Tract 362.01, Philadelphia County, Penn...    42    101  036201   

                                              geometry  total_property_count  \
379  POLYGON ((-8355008.800 4881749.700, -8354858.9...                  1126   
380  POLYGON ((-8354256.940 4879308.850, -8354227.1...                  2064   
381  POLYGON ((-8352727.080 4872434.270, -8352711.8...                  1410   
382  POLYGON ((-8351724.540 4874706.750, -8351706.9...                   875   
383  POLYGON ((-8348332.630 4877819.050, -8348033.1...                  1563   

     avg_rooms  avg_bathrooms  ...  Total Hous_y  Total Fami_y  Married-co_y  \
379   0.000000       0.267318  ...        1252.0         999.0         834.0   
380   0.008721       0.706880  ...        2155.0        1344.0         964.0   
381   0.111348       0.735461  ...        1556.0         970.0         651.0   
382   0.272000       0.657143  ...        1969.0        1215.0         761.0   
383   0.000000       0.852847  ...        1981.0        1257.0         839.0   

     Poverty St_y  Below Pove_y  Median Hou_y  Occupied H_y  Vacant Hou_y  \
379         999.0           0.0       81912.0        1252.0          32.0   
380        1344.0         172.0       52673.0        2155.0         249.0   
381         970.0         119.0       42692.0        1556.0         204.0   
382        1215.0          34.0       57463.0        1969.0          37.0   
383        1257.0          57.0       74360.0        1981.0         172.0   

     Owner-occu_y  Renter-occ_y  
379        1022.0         230.0  
380        1398.0         757.0  
381         935.0         621.0  
382         922.0        1047.0  
383        1327.0         654.0  

[5 rows x 29 columns]
def fix_missing_rooms(gdf):
    """Fill missing avg_rooms values in place and return the same frame.

    Where both avg_bedrooms and avg_bathrooms are known, the estimate is
    bedrooms + bathrooms + 0.5 (the 0.5 approximating a half-room such as a
    kitchenette); otherwise a floor value of 1 is used.
    """
    # The sum is NaN exactly where bedrooms or bathrooms is missing, so the
    # nested fillna applies the bed/bath estimate first and the constant 1
    # only where no estimate could be formed. Rows with a known avg_rooms
    # are untouched by the outer fillna.
    estimate = gdf["avg_bedrooms"] + gdf["avg_bathrooms"] + 0.5
    gdf["avg_rooms"] = gdf["avg_rooms"].fillna(estimate.fillna(1))

    return gdf

# Impute missing room averages before the dashboard reads them.
final_aggregated_data = fix_missing_rooms(final_aggregated_data)
# Build the Panel selector inputs.
# Columns the user can choose for the choropleth:
dynamic_cols = [
    'total_property_count', 'avg_rooms', 'avg_bathrooms',
    'avg_bedrooms', 'avg_sale_price', 'avg_market_value'
]

# ACS context columns shown in the hover tooltip (sale/market value are
# selectable above, so viewers see those too).
tooltip_cols = [
    'Total Popu_y', 'Median Age_y','White Alon_y', 'Black or A_y', 'Asian Alon_y',
    'Hispanic o_y','Total Hous_y', 'Total Fami_y', 'Married-co_y','Poverty St_y',
    'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
    'Owner-occu_y', 'Renter-occ_y'
]

# Folium expects lon/lat, so switch back to EPSG:4326 for the web map.
final_aggregated_data = final_aggregated_data.to_crs(epsg=4326)

from shapely.geometry import mapping
import folium
import geopandas as gpd
import panel as pn
import altair as alt
import mapclassify
import numpy as np

# Short alias used by the dashboard helper functions below.
gdf = final_aggregated_data

###############################
# Function/Legend!
###############################
def get_natural_breaks_bins(data_series, k=8, lower_q=0.01, upper_q=0.99):
    """Return k+1 bin edges for a choropleth using Jenks natural breaks.

    Values are clipped to the [lower_q, upper_q] quantiles before
    classification so extreme outliers don't dominate the breaks, but the
    first and last edges are stretched back to the true min/max so every
    value still falls inside a bin.
    """
    values = data_series.dropna()
    low_cut = values.quantile(lower_q)
    high_cut = values.quantile(upper_q)

    classifier = mapclassify.NaturalBreaks(np.clip(values, low_cut, high_cut), k=k)

    edges = [values.min(), *classifier.bins.tolist()]
    edges[-1] = values.max()  # restore the unclipped maximum as the top edge
    return edges

def create_legend_html(bins, legend_name):
    """Render a fixed-position HTML legend listing each bin as "lo ~ hi"."""
    # One <li> per consecutive pair of bin edges, two-decimal formatting.
    legend_items = "".join(
        f"<li>{bins[idx]:.2f} ~ {bins[idx + 1]:.2f}</li>"
        for idx in range(len(bins) - 1)
    )
    # Absolutely positioned box so it floats above the folium map tiles.
    return f"""
    <div style="
        position: absolute; 
        z-index:9999; 
        bottom: 80px; 
        right: 10px; 
        background-color: white;
        border:2px solid grey;
        border-radius:5px;
        padding: 10px;
    ">
      <h4 style="margin-top:0">{legend_name}</h4>
      <ul style="list-style:none; padding-left:0; margin:0;">
        {legend_items}
      </ul>
    </div>
    """

###############################
# Map!
###############################
def create_folium_map(selected_col):
    """Build a folium choropleth of `selected_col` per census tract.

    Reads the module-level `gdf` (EPSG:4326) and `tooltip_cols`. Folium's
    built-in legend is hidden with CSS and replaced by a custom HTML legend
    so the natural-breaks bins can be listed explicitly.
    """
    # FIX(review): centroids were previously taken directly in EPSG:4326,
    # which is geometrically incorrect and raised a GeoPandas warning.
    # Project to Web Mercator for the centroid, then back to lon/lat.
    centers = gdf.geometry.to_crs(epsg=3857).centroid.to_crs(epsg=4326)
    center_lat = centers.y.mean()
    center_lon = centers.x.mean()

    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=11,
        tiles='CartoDB Positron',
        control_scale=True
    )

    geojson_data = gdf.to_json()

    bins = get_natural_breaks_bins(gdf[selected_col], k=8, lower_q=0.01, upper_q=0.99)

    choropleth = folium.Choropleth(
        geo_data=geojson_data,
        data=gdf,
        columns=['GEOID', selected_col],
        key_on='feature.properties.GEOID',
        bins=bins,
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        highlight=True,
        legend_name=f'{selected_col} (Natural Breaks)',
    )
    choropleth.add_to(m)

    # Hide folium's default legend; the custom HTML legend below replaces it.
    style_statement = """
    <style>
    .legend {
      display: none !important;
    }
    </style>
    """
    m.get_root().header.add_child(folium.Element(style_statement))

    # Tooltip: tract ID plus the ACS context columns.
    folium.GeoJsonTooltip(
        fields=['GEOID'] + tooltip_cols,
        aliases=['GEOID'] + tooltip_cols,
        localize=True
    ).add_to(choropleth.geojson)

    legend_html = create_legend_html(bins, f"{selected_col} (Natural Breaks)")
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

###############################
# Histo!
###############################
def create_histogram(selected_col):
    """Altair histogram of `selected_col` across all tracts in `gdf`."""
    source = gdf.copy()
    x_enc = alt.X(f"{selected_col}:Q", bin=alt.Bin(maxbins=30), title=f"{selected_col}")
    y_enc = alt.Y('count()', title='Count')
    return (
        alt.Chart(source)
        .mark_bar()
        .encode(x=x_enc, y=y_enc)
        .properties(
            width=400,
            height=250,
            title=f"Distribution of {selected_col} (Natural Breaks used on map)"
        )
    )

###############################
# 4. Panel
###############################
select_widget = pn.widgets.Select(
    name="Select a column to visualize",
    options=dynamic_cols,
    value=dynamic_cols[0]
)

@pn.depends(select_widget.param.value)
def update_view(selected_col):
    """Re-render the dashboard row (folium map + histogram) for the chosen column."""
    # Folium has no native Panel pane, so embed its HTML representation.
    map_pane = pn.pane.HTML(
        create_folium_map(selected_col)._repr_html_(),
        width=700,
        height=500,
    )
    chart_pane = pn.pane.Vega(create_histogram(selected_col), width=400, height=300)
    return pn.Row(map_pane, chart_pane)

# Assemble the dashboard: title, prompt, the selector, and the reactive view
# (Panel re-invokes `update_view` whenever the widget value changes).
dashboard = pn.Column(
    "# Census Tract Interactive Dashboard",
    "select indicator to continue:",
    select_widget,
    update_view
)

# Mark the layout as servable so `panel serve` (or notebook display) renders it.
dashboard.servable()
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\4061076871.py:59: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  center_lat = gdf.geometry.centroid.y.mean()
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\4061076871.py:60: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  center_lon = gdf.geometry.centroid.x.mean()
C:\Users\44792\miniforge3\envs\musa-550-fall-2023\lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
### Now, Question 1: OLS regression of crime against tract characteristics
# Re-project both layers to Web Mercator so the spatial join compares
# geometries in a common CRS.
crime_gdf = crime_gdf.to_crs(epsg=3857)

census_gdf = census_gdf.to_crs(epsg=3857)

# Attach tract attributes to every crime point that falls inside a tract.
crime_by_tract = gpd.sjoin(
    crime_gdf,     
    census_gdf,   
    how='inner', 
    predicate='intersects'
)

print("crime_by_tract columns:", crime_by_tract.columns)
print("Number of records after sjoin:", len(crime_by_tract))
crime_by_tract columns: Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist',
       'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour',
       'dc_key', 'location_block', 'ucr_general', 'text_general_code',
       'point_x', 'point_y', 'geometry', 'year_month', 'crime_category',
       'index_right', 'GEOID', 'Total Popu', 'Median Age', 'White Alon',
       'Black or A', 'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami',
       'Married-co', 'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1',
       'Occupied H', 'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state',
       'county', 'tract'],
      dtype='object')
Number of records after sjoin: 462145
# Total crimes per tract, plus a tract x crime_category wide table.
crime_count_by_tract = crime_by_tract.groupby("GEOID").size().reset_index(name="total_crime_count")
print(crime_count_by_tract.head())
crime_count_by_tract_type = crime_by_tract.groupby(["GEOID","crime_category"]).size().unstack(fill_value=0)
         GEOID  total_crime_count
0  42101000100               2047
1  42101000200               1192
2  42101000300               1946
3  42101000401                810
4  42101000402               2991
# Left-join the counts onto the tracts; tracts with no recorded crimes
# come back as NaN, which we treat as zero.
census_gdf = census_gdf.merge(crime_count_by_tract, on="GEOID", how="left")

census_gdf["total_crime_count"] = census_gdf["total_crime_count"].fillna(0)

census_gdf.head()
GEOID Total Popu Median Age White Alon Black or A Asian Alon Hispanic o Total Hous Total Fami Married-co ... Occupied H Vacant Hou Owner-occu Renter-occ NAME state county tract geometry total_crime_count
0 42101009802 6190.0 33.5 580.0 5341.0 35.0 149.0 2129.0 1467.0 576.0 ... 2129.0 242.0 1593.0 536.0 Census Tract 98.02, Philadelphia County, Penns... 42 101 009802 POLYGON ((-8379626.770 4862662.870, -8379606.2... 1031
1 42101037500 3736.0 31.3 1672.0 1467.0 165.0 298.0 1224.0 733.0 547.0 ... 1224.0 159.0 722.0 502.0 Census Tract 375, Philadelphia County, Pennsyl... 42 101 037500 POLYGON ((-8378518.360 4863306.140, -8378433.4... 486
2 42101021900 1434.0 44.7 1307.0 50.0 24.0 41.0 654.0 307.0 238.0 ... 654.0 97.0 551.0 103.0 Census Tract 219, Philadelphia County, Pennsyl... 42 101 021900 POLYGON ((-8377278.820 4872712.160, -8377218.9... 167
3 42101011300 3344.0 28.5 62.0 3229.0 0.0 53.0 1102.0 770.0 133.0 ... 1102.0 290.0 621.0 481.0 Census Tract 113, Philadelphia County, Pennsyl... 42 101 011300 POLYGON ((-8375619.820 4863198.200, -8375613.5... 975
4 42101021800 5141.0 30.9 2822.0 1497.0 247.0 178.0 2242.0 1073.0 887.0 ... 2242.0 152.0 597.0 1645.0 Census Tract 218, Philadelphia County, Pennsyl... 42 101 021800 POLYGON ((-8375464.090 4874647.550, -8375257.5... 490

5 rows × 24 columns

# Readable aliases for the shapefile's 10-char truncated column names.
census_gdf["white_pop"] = census_gdf["White Alon"]
census_gdf["black_pop"] = census_gdf["Black or A"]
census_gdf["hisp_pop"] = census_gdf["Hispanic o"]

census_gdf["total_pop"] = census_gdf["Total Popu"]

# Everyone not counted in the three groups above.
census_gdf["other_pop"] = census_gdf["total_pop"] - (
    census_gdf["white_pop"] + census_gdf["black_pop"] + census_gdf["hisp_pop"]
)

# clip out negative (overlapping ACS categories can make the remainder < 0)
census_gdf["other_pop"] = census_gdf["other_pop"].clip(lower=0)

# race ratio — shares of total population per group
census_gdf["white_ratio"] = census_gdf["white_pop"] / census_gdf["total_pop"]
census_gdf["black_ratio"] = census_gdf["black_pop"] / census_gdf["total_pop"]
census_gdf["hisp_ratio"]  = census_gdf["hisp_pop"]  / census_gdf["total_pop"]
census_gdf["other_ratio"] = census_gdf["other_pop"] / census_gdf["total_pop"]

def calc_diversity(row):
    """Simpson-style diversity index for one tract: 1 - sum of squared shares.

    0 means a single group dominates entirely; values approach 0.75 when the
    four shares are equal.
    """
    shares = (
        row["white_ratio"],
        row["black_ratio"],
        row["hisp_ratio"],
        row["other_ratio"],
    )
    return 1 - sum(share ** 2 for share in shares)

# Row-wise apply is fine at this scale (a few hundred tracts).
census_gdf["diversity_index"] = census_gdf.apply(calc_diversity, axis=1)

census_gdf.head()
GEOID Total Popu Median Age White Alon Black or A Asian Alon Hispanic o Total Hous Total Fami Married-co ... white_pop black_pop hisp_pop total_pop other_pop white_ratio black_ratio hisp_ratio other_ratio diversity_index
0 42101009802 6190.0 33.5 580.0 5341.0 35.0 149.0 2129.0 1467.0 576.0 ... 580.0 5341.0 149.0 6190.0 120.0 0.093700 0.862843 0.024071 0.019386 0.245767
1 42101037500 3736.0 31.3 1672.0 1467.0 165.0 298.0 1224.0 733.0 547.0 ... 1672.0 1467.0 298.0 3736.0 299.0 0.447537 0.392666 0.079764 0.080032 0.632756
2 42101021900 1434.0 44.7 1307.0 50.0 24.0 41.0 654.0 307.0 238.0 ... 1307.0 50.0 41.0 1434.0 36.0 0.911437 0.034868 0.028591 0.025105 0.166620
3 42101011300 3344.0 28.5 62.0 3229.0 0.0 53.0 1102.0 770.0 133.0 ... 62.0 3229.0 53.0 3344.0 0.0 0.018541 0.965610 0.015849 0.000000 0.067002
4 42101021800 5141.0 30.9 2822.0 1497.0 247.0 178.0 2242.0 1073.0 887.0 ... 2822.0 1497.0 178.0 5141.0 644.0 0.548920 0.291188 0.034624 0.125267 0.597005

5 rows × 34 columns

# Bring the tract-level housing aggregates onto census_gdf; tracts missing
# from the housing table get 0 for all three metrics.
house_cols = ["GEOID", "avg_sale_price", "avg_rooms", "total_property_count"]
census_gdf = census_gdf.merge(final_aggregated_data[house_cols], on="GEOID", how="left")

census_gdf[["avg_sale_price","avg_rooms","total_property_count"]] = \
    census_gdf[["avg_sale_price","avg_rooms","total_property_count"]].fillna(0)
census_gdf.head()
print(census_gdf.columns)
Index(['GEOID', 'Total Popu', 'Median Age', 'White Alon', 'Black or A',
       'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami', 'Married-co',
       'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1', 'Occupied H',
       'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state', 'county',
       'tract', 'geometry', 'total_crime_count', 'white_pop', 'black_pop',
       'hisp_pop', 'total_pop', 'other_pop', 'white_ratio', 'black_ratio',
       'hisp_ratio', 'other_ratio', 'diversity_index', 'avg_sale_price',
       'avg_rooms', 'total_property_count'],
      dtype='object')
# Keep only the columns we consider useful or potentially related
# (not all of them will necessarily be used later).
columns_to_keep = [
    "GEOID",'Total Popu', 'Median Age','Poverty St', 'Below Pove','Married-co',
    "white_ratio", "black_ratio", "hisp_ratio", "other_ratio", "diversity_index",
    "total_crime_count", "avg_sale_price", "avg_rooms", "total_property_count",
    "geometry"
]

# Guard against columns that may not exist after earlier merges.
final_cols = [col for col in columns_to_keep if col in census_gdf.columns]
census_gdf = census_gdf[final_cols]
census_gdf.head()
GEOID Total Popu Median Age Poverty St Below Pove Married-co white_ratio black_ratio hisp_ratio other_ratio diversity_index total_crime_count avg_sale_price avg_rooms total_property_count geometry
0 42101009802 6190.0 33.5 1467.0 282.0 576.0 0.093700 0.862843 0.024071 0.019386 0.245767 1031 0.15 0.002908 2063 POLYGON ((-8379626.770 4862662.870, -8379606.2...
1 42101037500 3736.0 31.3 733.0 55.0 547.0 0.447537 0.392666 0.079764 0.080032 0.632756 486 0.41 0.022198 901 POLYGON ((-8378518.360 4863306.140, -8378433.4...
2 42101021900 1434.0 44.7 307.0 0.0 238.0 0.911437 0.034868 0.028591 0.025105 0.166620 167 0.17 0.176755 826 POLYGON ((-8377278.820 4872712.160, -8377218.9...
3 42101011300 3344.0 28.5 770.0 333.0 133.0 0.018541 0.965610 0.015849 0.000000 0.067002 975 0.06 0.000000 1323 POLYGON ((-8375619.820 4863198.200, -8375613.5...
4 42101021800 5141.0 30.9 1073.0 92.0 887.0 0.548920 0.291188 0.034624 0.125267 0.597005 490 0.20 0.037921 712 POLYGON ((-8375464.090 4874647.550, -8375257.5...
print(final_aggregated_data.columns)
Index(['GEOID', 'Total Ho_1', 'NAME', 'state', 'county', 'tract', 'geometry',
       'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
       'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
       'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
       'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
       'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
       'Owner-occu_y', 'Renter-occ_y'],
      dtype='object')
# Pull the ACS comparison columns (suffixed *_y inside final_aggregated_data
# by an earlier merge) onto census_gdf. The original code did this in two
# back-to-back merges on the same key; a single left join produces the exact
# same columns in the same order with one pass over the frame.
acs_cols = [
    "GEOID",
    "Total Popu_y",
    "Median Age_y",
    "Poverty St_y",
    "Below Pove_y",
    "Vacant Hou_y",
]
census_gdf = census_gdf.merge(
    final_aggregated_data[acs_cols],
    on="GEOID",
    how="left",
)
census_gdf.head()
GEOID Total Popu Median Age Poverty St Below Pove Married-co white_ratio black_ratio hisp_ratio other_ratio ... total_crime_count avg_sale_price avg_rooms total_property_count geometry Total Popu_y Median Age_y Poverty St_y Below Pove_y Vacant Hou_y
0 42101009802 6190.0 33.5 1467.0 282.0 576.0 0.093700 0.862843 0.024071 0.019386 ... 1031 0.15 0.002908 2063 POLYGON ((-8379626.770 4862662.870, -8379606.2... 6190.0 33.5 1467.0 282.0 242.0
1 42101037500 3736.0 31.3 733.0 55.0 547.0 0.447537 0.392666 0.079764 0.080032 ... 486 0.41 0.022198 901 POLYGON ((-8378518.360 4863306.140, -8378433.4... 3736.0 31.3 733.0 55.0 159.0
2 42101021900 1434.0 44.7 307.0 0.0 238.0 0.911437 0.034868 0.028591 0.025105 ... 167 0.17 0.176755 826 POLYGON ((-8377278.820 4872712.160, -8377218.9... 1434.0 44.7 307.0 0.0 97.0
3 42101011300 3344.0 28.5 770.0 333.0 133.0 0.018541 0.965610 0.015849 0.000000 ... 975 0.06 0.000000 1323 POLYGON ((-8375619.820 4863198.200, -8375613.5... 3344.0 28.5 770.0 333.0 290.0
4 42101021800 5141.0 30.9 1073.0 92.0 887.0 0.548920 0.291188 0.034624 0.125267 ... 490 0.20 0.037921 712 POLYGON ((-8375464.090 4874647.550, -8375257.5... 5141.0 30.9 1073.0 92.0 152.0

5 rows × 21 columns

# Crimes per 1,000 residents; zero out NaN/inf produced by zero-population
# tracts, then drop those tracts entirely.
census_gdf["crime_rate"] = (census_gdf["total_crime_count"] / census_gdf["Total Popu_y"]) * 1000
census_gdf["crime_rate"] = census_gdf["crime_rate"].fillna(0).replace([np.inf, -np.inf], 0)
census_gdf = census_gdf[ (census_gdf["Total Popu_y"] > 0) & (census_gdf["crime_rate"].notnull()) ]

# Dependent variable
y = census_gdf["crime_rate"]

# Independent variables
X = census_gdf[["black_ratio", "white_ratio", "avg_sale_price"]]
import statsmodels.api as sm

# I manually add the intercept here; it is constant for all rows so it should have no influence
X = sm.add_constant(X)
X.head()
const black_ratio white_ratio avg_sale_price
0 1.0 0.862843 0.093700 0.15
1 1.0 0.392666 0.447537 0.41
2 1.0 0.034868 0.911437 0.17
3 1.0 0.965610 0.018541 0.06
4 1.0 0.291188 0.548920 0.20
# missing='drop' silently excludes rows with NaN in y or X from the fit.
model = sm.OLS(y, X, missing='drop')
results = model.fit()
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             crime_rate   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.005
Method:                 Least Squares   F-statistic:                    0.3550
Date:                Thu, 26 Dec 2024   Prob (F-statistic):              0.786
Time:                        14:17:29   Log-Likelihood:                -3156.2
No. Observations:                 376   AIC:                             6320.
Df Residuals:                     372   BIC:                             6336.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            574.9899    238.005      2.416      0.016     106.987    1042.993
black_ratio     -267.0188    295.893     -0.902      0.367    -848.852     314.814
white_ratio     -236.9274    330.994     -0.716      0.475    -887.782     413.927
avg_sale_price     3.9940      9.458      0.422      0.673     -14.603      22.591
==============================================================================
Omnibus:                      810.129   Durbin-Watson:                   1.978
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1125693.675
Skew:                          15.633   Prob(JB):                         0.00
Kurtosis:                     269.224   Cond. No.                         51.8
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# add_constant() may have appended a 'const' column; VIF is only meaningful
# for the actual predictors, so exclude it before computing.
X_no_const = X.loc[:, X.columns != 'const']

vif_data = pd.DataFrame()
vif_data["feature"] = X_no_const.columns
vif_data["VIF"] = [variance_inflation_factor(X_no_const.values, i) for i in range(X_no_const.shape[1])]

print(vif_data)
          feature       VIF
0     black_ratio  1.064220
1     white_ratio  1.072523
2  avg_sale_price  1.011484
import matplotlib.pyplot as plt

# Visual sanity check of model fit: actual vs. predicted crime rate.
y_pred = results.predict(X)
plt.scatter(y, y_pred, alpha=0.5)
plt.xlabel("Actual Crime Rate")
plt.ylabel("Predicted Crime Rate")
plt.title("OLS Fit: Actual vs. Predicted")
plt.show()

# Keep residuals for possible spatial inspection later.
census_gdf["resid"] = results.resid

# Let's improve the model: add age, poverty, and vacancy controls.
census_gdf["Median Age_y"] = census_gdf["Median Age_y"].fillna(0)
census_gdf["Poverty St_y"] = census_gdf["Poverty St_y"].fillna(0)
census_gdf["Below Pove_y"] = census_gdf["Below Pove_y"].fillna(0)
census_gdf["Vacant Hou_y"] = census_gdf["Vacant Hou_y"].fillna(0)

# Share of residents below the poverty line; guard divide-by-zero tracts.
census_gdf["below_poverty_ratio"] = (
    census_gdf["Below Pove_y"] / census_gdf["Total Popu_y"]
).replace([np.inf, -np.inf], 0).fillna(0)
feature_cols = [
    "black_ratio",
    "white_ratio",
    "avg_sale_price", 
    "Median Age_y",
    "below_poverty_ratio",
    "Vacant Hou_y"
]
import statsmodels.api as sm

# Same dependent variable as before
y = census_gdf["crime_rate"]

# New independent variables (expanded feature set)
X = census_gdf[feature_cols].copy()
X = X.fillna(0)

# Log-transform price to damp extreme values
X["avg_sale_price"] = np.log1p(X["avg_sale_price"])

X = sm.add_constant(X)

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF on predictors only (drop the intercept column first).
X_no_const = X.drop(columns=["const"], errors="ignore")
vif_data = pd.DataFrame()
vif_data["feature"] = X_no_const.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_no_const.values, i) 
    for i in range(X_no_const.shape[1])
]
print(vif_data)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             crime_rate   R-squared:                       0.050
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     3.237
Date:                Thu, 26 Dec 2024   Prob (F-statistic):            0.00411
Time:                        14:17:29   Log-Likelihood:                -3147.1
No. Observations:                 376   AIC:                             6308.
Df Residuals:                     369   BIC:                             6336.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                -582.6870    429.256     -1.357      0.175   -1426.783     261.409
black_ratio           189.6411    326.899      0.580      0.562    -453.178     832.460
white_ratio           667.2105    427.683      1.560      0.120    -173.791    1508.212
avg_sale_price        281.4062    149.743      1.879      0.061     -13.051     575.863
Median Age_y            8.4286      8.189      1.029      0.304      -7.674      24.531
below_poverty_ratio  1.151e+04   2816.750      4.086      0.000    5971.367     1.7e+04
Vacant Hou_y           -0.6743      0.421     -1.600      0.110      -1.503       0.154
==============================================================================
Omnibus:                      781.262   Durbin-Watson:                   1.971
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           896403.493
Skew:                          14.465   Prob(JB):                         0.00
Kurtosis:                     240.445   Cond. No.                     1.41e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.41e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
               feature        VIF
0          black_ratio   8.066012
1          white_ratio   8.710505
2       avg_sale_price   1.488647
3         Median Age_y  19.326683
4  below_poverty_ratio   3.727617
5         Vacant Hou_y   4.233158
# What a pity: we can't really say crime is related to sale prices or race yet,
# but it does relate strongly to the poverty rate. Try deleting high-VIF
# factors to see if there are improvements.
feature_cols = [
    "black_ratio",       
    "white_ratio",       
    "avg_sale_price",    
    "Median Age_y",      # VIF≈19
    "below_poverty_ratio",
    "Vacant Hou_y"
]

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Work on a copy so the filtering/fits below don't mutate census_gdf.
df = census_gdf.copy()

X = df[feature_cols].fillna(0).copy()

X = sm.add_constant(X)

X_no_const = X.drop(columns=["const"], errors="ignore")

vif_data = pd.DataFrame()
vif_data["feature"] = X_no_const.columns
vif_data["VIF"] = [
    variance_inflation_factor(X_no_const.values, i) for i in range(X_no_const.shape[1])
]
print("Initial VIF:\n", vif_data)
Initial VIF:
                feature        VIF
0          black_ratio   8.124570
1          white_ratio   8.672783
2       avg_sale_price   1.040092
3         Median Age_y  19.298617
4  below_poverty_ratio   3.655698
5         Vacant Hou_y   4.147828
# Re-check collinearity after dropping the worst offender (Median Age_y).
feature_cols_reduced = [
    "black_ratio",
    "white_ratio",
    # "Median Age_y",  # dropped: VIF ≈ 19
    "avg_sale_price",
    "below_poverty_ratio",
    "Vacant Hou_y"
]

X2 = df[feature_cols_reduced].fillna(0).copy()

X2 = sm.add_constant(X2)
X2_no_const = X2.drop(columns=["const"], errors="ignore")

vif_data2 = pd.DataFrame()
vif_data2["feature"] = X2_no_const.columns
vif_data2["VIF"] = [
    variance_inflation_factor(X2_no_const.values, i) for i in range(X2_no_const.shape[1])
]
print("VIF after dropping Median Age_y:\n", vif_data2)
VIF after dropping Median Age_y:
                feature       VIF
0          black_ratio  3.544943
1          white_ratio  1.481318
2       avg_sale_price  1.034620
3  below_poverty_ratio  3.007876
4         Vacant Hou_y  4.123210
# Final model: reduced feature set, with log-transformed sale price.
X_final = df[feature_cols_reduced].fillna(0).copy()

X_final["avg_sale_price"] = np.log1p(X_final["avg_sale_price"])

X_final = sm.add_constant(X_final)

y = df["crime_rate"].fillna(0)

model = sm.OLS(y, X_final, missing='drop')
results = model.fit()
print(results.summary())

# Confirm the collinearity actually improved after the drop.
X_no_const_final = X_final.drop(columns=["const"], errors="ignore")
vif_data_final = pd.DataFrame({
    "feature": X_no_const_final.columns,
    "VIF": [
        variance_inflation_factor(X_no_const_final.values, i)
        for i in range(X_no_const_final.shape[1])
    ]
})
print("Final VIF after removing high-collinearity vars:\n", vif_data_final)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             crime_rate   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     3.672
Date:                Thu, 26 Dec 2024   Prob (F-statistic):            0.00296
Time:                        14:17:29   Log-Likelihood:                -3147.6
No. Observations:                 376   AIC:                             6307.
Df Residuals:                     370   BIC:                             6331.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                -315.5345    341.930     -0.923      0.357    -987.904     356.835
black_ratio           267.0040    318.167      0.839      0.402    -358.639     892.647
white_ratio           760.4843    418.006      1.819      0.070     -61.481    1582.450
avg_sale_price        268.6425    149.241      1.800      0.073     -24.824     562.109
below_poverty_ratio  1.119e+04   2800.248      3.998      0.000    5688.368    1.67e+04
Vacant Hou_y           -0.7413      0.416     -1.781      0.076      -1.560       0.077
==============================================================================
Omnibus:                      785.338   Durbin-Watson:                   1.975
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           930284.806
Skew:                          14.623   Prob(JB):                         0.00
Kurtosis:                     244.919   Cond. No.                     1.40e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.4e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Final VIF after removing high-collinearity vars:
                feature       VIF
0          black_ratio  3.537142
1          white_ratio  1.717726
2       avg_sale_price  1.478665
3  below_poverty_ratio  3.057857
4         Vacant Hou_y  4.204546
# It seems housing prices are far from the most important factor influencing crime.
# Q2: here we test how each crime category relates to sale and market prices.
import os

import geopandas as gpd
import statsmodels.api as sm

shp_path = "final/Philadelphia_ACS_2023.shp"

# NOTE(review): geopandas.read_file raises a fiona/pyogrio DriverError for a
# missing path, not FileNotFoundError, so the original try/except around it
# could never fire. Check existence up front and raise the same clear message.
if not os.path.exists(shp_path):
    raise FileNotFoundError(f"The file at {shp_path} does not exist. Please check the path.")

tracts_gdf = gpd.read_file(shp_path)
print(f"Loaded {len(tracts_gdf)} Census Tracts from {shp_path}")

# Normalize to WGS84 lat/lon for consistency with the rest of the notebook.
if tracts_gdf.crs != "EPSG:4326":
    tracts_gdf = tracts_gdf.to_crs(epsg=4326)

tracts_gdf.head()
Loaded 384 Census Tracts from final/Philadelphia_ACS_2023.shp
GEOID Total Popu Median Age White Alon Black or A Asian Alon Hispanic o Total Hous Total Fami Married-co ... Total Ho_1 Occupied H Vacant Hou Owner-occu Renter-occ NAME state county tract geometry
0 42101009802 6190.0 33.5 580.0 5341.0 35.0 149.0 2129.0 1467.0 576.0 ... 2371.0 2129.0 242.0 1593.0 536.0 Census Tract 98.02, Philadelphia County, Penns... 42 101 009802 POLYGON ((-75.27547 39.97743, -75.27528 39.977...
1 42101037500 3736.0 31.3 1672.0 1467.0 165.0 298.0 1224.0 733.0 547.0 ... 1383.0 1224.0 159.0 722.0 502.0 Census Tract 375, Philadelphia County, Pennsyl... 42 101 037500 POLYGON ((-75.26551 39.98186, -75.26475 39.982...
2 42101021900 1434.0 44.7 1307.0 50.0 24.0 41.0 654.0 307.0 238.0 ... 751.0 654.0 97.0 551.0 103.0 Census Tract 219, Philadelphia County, Pennsyl... 42 101 021900 POLYGON ((-75.25438 40.04657, -75.25384 40.046...
3 42101011300 3344.0 28.5 62.0 3229.0 0.0 53.0 1102.0 770.0 133.0 ... 1392.0 1102.0 290.0 621.0 481.0 Census Tract 113, Philadelphia County, Pennsyl... 42 101 011300 POLYGON ((-75.23947 39.98111, -75.23942 39.981...
4 42101021800 5141.0 30.9 2822.0 1497.0 247.0 178.0 2242.0 1073.0 887.0 ... 2394.0 2242.0 152.0 597.0 1645.0 Census Tract 218, Philadelphia County, Pennsyl... 42 101 021800 POLYGON ((-75.23807 40.05988, -75.23622 40.061...

5 rows × 23 columns

print("Number of crime records:", len(crime_gdf))
crime_gdf.head()

# Re-project both layers to Web Mercator for a consistent spatial join.
crime_gdf = crime_gdf.to_crs(epsg=3857)
tracts_gdf = tracts_gdf.to_crs(epsg=3857)

# 1) sjoin: attach tract attributes to each crime point
crime_by_tract = gpd.sjoin(
    crime_gdf, 
    tracts_gdf, 
    how="inner", 
    predicate="intersects"
).copy()

print("Records after sjoin:", len(crime_by_tract))
crime_by_tract.head()
print(crime_by_tract.columns)
Number of crime records: 462960
Records after sjoin: 462145
Index(['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist',
       'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour',
       'dc_key', 'location_block', 'ucr_general', 'text_general_code',
       'point_x', 'point_y', 'geometry', 'year_month', 'crime_category',
       'index_right', 'GEOID', 'Total Popu', 'Median Age', 'White Alon',
       'Black or A', 'Asian Alon', 'Hispanic o', 'Total Hous', 'Total Fami',
       'Married-co', 'Poverty St', 'Below Pove', 'Median Hou', 'Total Ho_1',
       'Occupied H', 'Vacant Hou', 'Owner-occu', 'Renter-occ', 'NAME', 'state',
       'county', 'tract'],
      dtype='object')
# Long-format counts: one row per (tract, crime category) pair.
crime_counts = (crime_by_tract
    .groupby(["GEOID","crime_category"])
    .size()
    .reset_index(name="count")
)

crime_counts.head()
GEOID crime_category count
0 42101000100 Drug/Alcohol Crime 16
1 42101000100 Miscellaneous 419
2 42101000100 Other 274
3 42101000100 Property Crime 1192
4 42101000100 Sexual Offenses 8
# Pivot to wide format: one row per tract, one column per crime category;
# categories absent from a tract become 0 rather than NaN.
crime_counts_wide = crime_counts.pivot_table(
    index="GEOID",
    columns="crime_category",
    values="count",
    fill_value=0
).reset_index()

# Drop the "crime_category" axis label left over from the pivot.
crime_counts_wide.columns.name = None
crime_counts_wide.head()
GEOID Drug/Alcohol Crime Miscellaneous Other Property Crime Sexual Offenses Violent Crime
0 42101000100 16.0 419.0 274.0 1192.0 8.0 138.0
1 42101000200 12.0 165.0 174.0 711.0 24.0 106.0
2 42101000300 11.0 413.0 433.0 942.0 19.0 128.0
3 42101000401 3.0 83.0 94.0 579.0 3.0 48.0
4 42101000402 40.0 601.0 509.0 1497.0 46.0 298.0
print("final_aggregated_data columns:\n", final_aggregated_data.columns)
print("Length of final_aggregated_data:", len(final_aggregated_data))

# Join category counts with the housing/ACS aggregates for regression.
merged_df = crime_counts_wide.merge(
    final_aggregated_data, 
    on="GEOID",
    how="left"
)

# Blanket fillna(0) also hits the geometry column (harmless here, but note
# the GeometryArray DeprecationWarning it triggers).
merged_df = merged_df.fillna(0)

print("Merged shape:", merged_df.shape)
merged_df.head()
final_aggregated_data columns:
 Index(['GEOID', 'Total Ho_1', 'NAME', 'state', 'county', 'tract', 'geometry',
       'total_property_count', 'avg_rooms', 'avg_bathrooms', 'avg_bedrooms',
       'avg_sale_price', 'avg_market_value', 'Total Popu_y', 'Median Age_y',
       'White Alon_y', 'Black or A_y', 'Asian Alon_y', 'Hispanic o_y',
       'Total Hous_y', 'Total Fami_y', 'Married-co_y', 'Poverty St_y',
       'Below Pove_y', 'Median Hou_y', 'Occupied H_y', 'Vacant Hou_y',
       'Owner-occu_y', 'Renter-occ_y'],
      dtype='object')
Length of final_aggregated_data: 384
Merged shape: (384, 35)
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_37844\2015751945.py:10: DeprecationWarning: ExtensionArray.fillna added a 'copy' keyword in pandas 2.1.0. In a future version, ExtensionArray subclasses will need to implement this keyword or an exception will be raised. In the interim, the keyword is ignored by GeometryArray.
  merged_df = merged_df.fillna(0)
GEOID Drug/Alcohol Crime Miscellaneous Other Property Crime Sexual Offenses Violent Crime Total Ho_1 NAME state ... Total Hous_y Total Fami_y Married-co_y Poverty St_y Below Pove_y Median Hou_y Occupied H_y Vacant Hou_y Owner-occu_y Renter-occ_y
0 42101000100 16.0 419.0 274.0 1192.0 8.0 138.0 2827.0 Census Tract 1, Philadelphia County, Pennsylvania 42 ... 2489.0 753.0 621.0 753.0 14.0 103585.0 2489.0 338.0 901.0 1588.0
1 42101000200 12.0 165.0 174.0 711.0 24.0 106.0 1219.0 Census Tract 2, Philadelphia County, Pennsylvania 42 ... 1082.0 503.0 402.0 503.0 55.0 49871.0 1082.0 137.0 523.0 559.0
2 42101000300 11.0 413.0 433.0 942.0 19.0 128.0 2063.0 Census Tract 3, Philadelphia County, Pennsylvania 42 ... 1828.0 588.0 516.0 588.0 16.0 86296.0 1828.0 235.0 441.0 1387.0
3 42101000401 3.0 83.0 94.0 579.0 3.0 48.0 1973.0 Census Tract 4.01, Philadelphia County, Pennsy... 42 ... 1618.0 437.0 414.0 437.0 50.0 62986.0 1618.0 355.0 491.0 1127.0
4 42101000402 40.0 601.0 509.0 1497.0 46.0 298.0 3448.0 Census Tract 4.02, Philadelphia County, Pennsy... 42 ... 2918.0 672.0 601.0 672.0 0.0 78947.0 2918.0 530.0 1679.0 1239.0

5 rows × 35 columns

def run_regression_crime_type(df, crime_col, housing_cols):
    """Fit and print an OLS of one crime-category count on housing features.

    Parameters
    ----------
    df : merged tract-level dataframe (crime counts + housing aggregates)
    crime_col : name of the dependent column, e.g. "Violent Crime"
    housing_cols : list of column names used as regressors

    Returns the fitted statsmodels results object.
    """
    response = df[crime_col].values

    # has_constant='add' forces an intercept even if a constant column exists.
    predictors = sm.add_constant(df[housing_cols].copy(), has_constant='add')

    fitted = sm.OLS(response, predictors, missing='drop').fit()

    print(f"Regression on {crime_col} ~ {housing_cols}")
    print(fitted.summary())
    print("-"*80)

    return fitted
    
# Run the same housing-feature regression once per crime category and keep
# every fitted results object for later comparison.
housing_features = ["avg_sale_price", "avg_rooms", "total_property_count"]
crime_types = [
    "Violent Crime",
    "Property Crime",
    "Drug/Alcohol Crime",
    "Sexual Offenses",
    "Miscellaneous"
]

results_dict = {}

for ctype in crime_types:
    if ctype in merged_df.columns:
        results = run_regression_crime_type(
            merged_df, 
            ctype, 
            housing_features
        )
        results_dict[ctype] = results
    else:
        print(f"Column {ctype} not in merged_df, skipping...")
Regression on Violent Crime ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.205
Model:                            OLS   Adj. R-squared:                  0.199
Method:                 Least Squares   F-statistic:                     32.64
Date:                Thu, 26 Dec 2024   Prob (F-statistic):           8.60e-19
Time:                        14:17:33   Log-Likelihood:                -2209.9
No. Observations:                 384   AIC:                             4428.
Df Residuals:                     380   BIC:                             4444.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   43.7517      9.639      4.539      0.000      24.799      62.704
avg_sale_price          -0.2682      0.647     -0.415      0.679      -1.540       1.003
avg_rooms              -55.5946     12.919     -4.303      0.000     -80.997     -30.192
total_property_count     0.0463      0.005      8.456      0.000       0.036       0.057
==============================================================================
Omnibus:                      120.879   Durbin-Watson:                   0.782
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              399.855
Skew:                           1.411   Prob(JB):                     1.49e-87
Kurtosis:                       7.127   Cond. No.                     5.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Property Crime ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.084
Model:                            OLS   Adj. R-squared:                  0.077
Method:                 Least Squares   F-statistic:                     11.67
Date:                Thu, 26 Dec 2024   Prob (F-statistic):           2.48e-07
Time:                        14:17:33   Log-Likelihood:                -2852.2
No. Observations:                 384   AIC:                             5712.
Df Residuals:                     380   BIC:                             5728.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  481.3326     51.328      9.378      0.000     380.410     582.255
avg_sale_price           0.2273      3.444      0.066      0.947      -6.544       6.999
avg_rooms             -200.7619     68.794     -2.918      0.004    -336.027     -65.497
total_property_count     0.1431      0.029      4.909      0.000       0.086       0.200
==============================================================================
Omnibus:                      254.406   Durbin-Watson:                   1.620
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2874.925
Skew:                           2.688   Prob(JB):                         0.00
Kurtosis:                      15.280   Cond. No.                     5.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Drug/Alcohol Crime ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     5.304
Date:                Thu, 26 Dec 2024   Prob (F-statistic):            0.00136
Time:                        14:17:33   Log-Likelihood:                -2257.8
No. Observations:                 384   AIC:                             4524.
Df Residuals:                     380   BIC:                             4539.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -1.2540     10.919     -0.115      0.909     -22.724      20.216
avg_sale_price           0.0067      0.733      0.009      0.993      -1.434       1.447
avg_rooms              -20.9467     14.635     -1.431      0.153     -49.723       7.829
total_property_count     0.0222      0.006      3.584      0.000       0.010       0.034
==============================================================================
Omnibus:                      615.491   Durbin-Watson:                   0.679
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           142748.835
Skew:                           8.928   Prob(JB):                         0.00
Kurtosis:                      95.752   Cond. No.                     5.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Sexual Offenses ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     3.202
Date:                Thu, 26 Dec 2024   Prob (F-statistic):             0.0233
Time:                        14:17:33   Log-Likelihood:                -1866.6
No. Observations:                 384   AIC:                             3741.
Df Residuals:                     380   BIC:                             3757.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    8.0772      3.942      2.049      0.041       0.326      15.829
avg_sale_price          -0.0336      0.265     -0.127      0.899      -0.554       0.487
avg_rooms              -10.2522      5.284     -1.940      0.053     -20.642       0.137
total_property_count     0.0050      0.002      2.249      0.025       0.001       0.009
==============================================================================
Omnibus:                      806.172   Durbin-Watson:                   1.691
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1063695.011
Skew:                          14.805   Prob(JB):                         0.00
Kurtosis:                     259.133   Cond. No.                     5.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
Regression on Miscellaneous ~ ['avg_sale_price', 'avg_rooms', 'total_property_count']
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.167
Model:                            OLS   Adj. R-squared:                  0.160
Method:                 Least Squares   F-statistic:                     25.37
Date:                Thu, 26 Dec 2024   Prob (F-statistic):           5.58e-15
Time:                        14:17:33   Log-Likelihood:                -2421.9
No. Observations:                 384   AIC:                             4852.
Df Residuals:                     380   BIC:                             4868.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   90.9089     16.741      5.430      0.000      57.992     123.826
avg_sale_price          -0.4549      1.123     -0.405      0.686      -2.664       1.754
avg_rooms              -75.0119     22.438     -3.343      0.001    -119.130     -30.894
total_property_count     0.0729      0.010      7.670      0.000       0.054       0.092
==============================================================================
Omnibus:                      183.690   Durbin-Watson:                   1.059
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              965.740
Skew:                           2.029   Prob(JB):                    1.96e-210
Kurtosis:                       9.625   Cond. No.                     5.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------------
crime_types = ["Violent Crime","Property Crime","Drug/Alcohol Crime","Sexual Offenses","Miscellaneous"]

# One scatter panel per crime type: count vs. average sale price.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(20, 4), sharey=False)

for ax, ctype in zip(axes, crime_types):
    ax.scatter(merged_df["avg_sale_price"], merged_df[ctype], alpha=0.5)
    ax.set_xlabel("Avg Sale Price")
    ax.set_ylabel(ctype)
    ax.set_title(f"{ctype} vs. Avg Sale Price")

plt.tight_layout()
plt.show()

# All five crime types overlaid on a single scatter, one color per type.
plt.figure(figsize=(8, 6))

crime_types = ["Violent Crime","Property Crime","Drug/Alcohol Crime","Sexual Offenses","Miscellaneous"]
colors = ["red","blue","green","orange","purple"]

for idx, ctype in enumerate(crime_types):
    plt.scatter(merged_df["avg_sale_price"], merged_df[ctype],
                alpha=0.5, color=colors[idx], label=ctype)

plt.xlabel("Avg Sale Price")
plt.ylabel("Crime Count")
plt.title("Crime Count vs. Avg Sale Price (All Types in One Plot)")
plt.legend()
plt.show()

#Q3
# Quick look at the structure and content of the crime GeoDataFrame.
cols = crime_gdf.columns.tolist()
print("Columns in crime_gdf:", cols)

print(crime_gdf.head())
Columns in crime_gdf: ['cartodb_id', 'the_geom', 'the_geom_webmercator', 'objectid', 'dc_dist', 'psa', 'dispatch_date_time', 'dispatch_date', 'dispatch_time', 'hour', 'dc_key', 'location_block', 'ucr_general', 'text_general_code', 'point_x', 'point_y', 'geometry', 'year_month', 'crime_category']
   cartodb_id                                           the_geom  \
0           2  0101000020E6100000A51C8299A5C752C006342AD3DCFF...   
1           4  0101000020E6100000F9245E3B64CC52C0B7195D940FF6...   
2           7  0101000020E6100000118A52E7F6C052C0CFF41263190C...   
3         123  0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02...   
4         126  0101000020E6100000D1CCD5875CCA52C014B723FFC005...   

                                the_geom_webmercator  objectid dc_dist psa  \
0  0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F...       114      25   3   
1  0101000020110F00000426B7CE54EE5FC1C5E06D37E284...       116      01   1   
2  0101000020110F00006728CED7EBDA5FC169DB64F8519D...       119      08   2   
3  0101000020110F00009D28D4D968E25FC13CD5C3D06F92...        96      15   1   
4  0101000020110F00002F28E30AE2EA5FC10090A3314796...        99      14   1   

     dispatch_date_time dispatch_date dispatch_time  hour        dc_key  \
0  2023-03-11T17:12:00Z    2023-03-11      12:12:00  12.0  202325014482   
1  2023-03-11T18:31:00Z    2023-03-11      13:31:00  13.0  202301004597   
2  2023-03-11T22:13:00Z    2023-03-11      17:13:00  17.0  202308008412   
3  2023-03-11T12:42:00Z    2023-03-11      07:42:00   7.0  202315017366   
4  2023-03-12T00:54:00Z    2023-03-11      19:54:00  19.0  202314012625   

              location_block ucr_general   text_general_code    point_x  \
0    3300 BLOCK HARTVILLE ST         300  Robbery No Firearm -75.119482   
1       2400 BLOCK S 28TH ST         600  Theft from Vehicle -75.193618   
2  9800 BLOCK Roosevelt Blvd         600              Thefts -75.015070   
3      4700 BLOCK GRISCOM ST         600              Thefts -75.083953   
4        5500 BLOCK BLOYD ST         300  Robbery No Firearm -75.161898   

     point_y                          geometry year_month  crime_category  
0  39.998927  POINT (-8362262.529 4865786.288)    2023-03   Violent Crime  
1  39.922350  POINT (-8370515.230 4854664.866)    2023-03  Property Crime  
2  40.094525  POINT (-8350639.372 4879687.881)    2023-03  Property Crime  
3  40.017896  POINT (-8358307.404 4868543.262)    2023-03  Property Crime  
4  40.044952  POINT (-8366984.170 4872476.776)    2023-03   Violent Crime  
# Verify that every category we model actually appears in the data.
expected_cats = {"Violent Crime", "Property Crime", "Drug/Alcohol Crime",
                 "Sexual Offenses", "Miscellaneous"}
unique_cats = crime_gdf["crime_category"].unique()
print("Unique crime categories in crime_gdf:", unique_cats)

missing_cats = expected_cats - set(unique_cats)
if not missing_cats:
    print("All 5 categories are present!")
else:
    print("Warning: These categories not found in data:", missing_cats)
Unique crime categories in crime_gdf: ['Violent Crime' 'Property Crime' 'Sexual Offenses' 'Other'
 'Drug/Alcohol Crime' 'Miscellaneous']
All 5 categories are present!
# Derive calendar features from the dispatch timestamp.
crime_gdf["year_month"] = crime_gdf["dispatch_date"].dt.to_period("M").astype(str)

# Monday=0 ... Sunday=6
crime_gdf["day_of_week"] = crime_gdf["dispatch_date"].dt.dayofweek

# BUG FIX: dispatch_date is date-only, so `.dt.hour` was always 0 and the
# hour-of-day analysis below was meaningless (every record fell in hour 0).
# The feed already ships an `hour` column (see the column list printed
# above); use it directly, coercing any bad values to NaN.
crime_gdf["hour_of_day"] = pd.to_numeric(crime_gdf["hour"], errors="coerce")

# Sanity checks on the derived fields.
print("Unique day_of_week:", sorted(crime_gdf["day_of_week"].unique()))
print("Unique hour_of_day:", sorted(crime_gdf["hour_of_day"].dropna().unique()))

print("Total records in crime_gdf:", len(crime_gdf))

missing_date = crime_gdf["dispatch_date"].isna().sum()
print("Missing dispatch_date rows:", missing_date)

missing_cat = crime_gdf["crime_category"].isna().sum()
print("Missing crime_category rows:", missing_cat)

group_month_cat = crime_gdf.groupby(["year_month","crime_category"]).size().reset_index(name="count")
print("First few lines of month-cat pivot:")
print(group_month_cat.head(10).to_string())

# Month coverage
unique_months = group_month_cat["year_month"].unique()
print(f"Found {len(unique_months)} distinct year_month values.")
Unique day_of_week: [0, 1, 2, 3, 4, 5, 6]
Unique hour_of_day: [0]
Total records in crime_gdf: 462960
Missing dispatch_date rows: 0
Missing crime_category rows: 0
First few lines of month-cat pivot:
  year_month      crime_category  count
0    2022-01  Drug/Alcohol Crime    257
1    2022-01       Miscellaneous   1542
2    2022-01               Other   1752
3    2022-01      Property Crime   5608
4    2022-01     Sexual Offenses    123
5    2022-01       Violent Crime   1234
6    2022-02  Drug/Alcohol Crime    335
7    2022-02       Miscellaneous   1748
8    2022-02               Other   1840
9    2022-02      Property Crime   5295
Found 36 distinct year_month values.
# Day-of-week x category counts.
group_dow_cat = crime_gdf.groupby(["day_of_week","crime_category"]).size().reset_index(name="count")
print(group_dow_cat.head(14))

# Hour-of-day x category counts.
# NOTE: the original cell ran this groupby and print twice back-to-back;
# the accidental duplicate has been removed.
group_hour_cat = crime_gdf.groupby(["hour_of_day","crime_category"]).size().reset_index(name="count")
print(group_hour_cat.head(10))
    day_of_week      crime_category  count
0             0  Drug/Alcohol Crime   1184
1             0       Miscellaneous  10568
2             0               Other  11818
3             0      Property Crime  41281
4             0     Sexual Offenses    806
5             0       Violent Crime   6115
6             1  Drug/Alcohol Crime   1830
7             1       Miscellaneous  12314
8             1               Other  11492
9             1      Property Crime  39193
10            1     Sexual Offenses    853
11            1       Violent Crime   5773
12            2  Drug/Alcohol Crime   2020
13            2       Miscellaneous  12228
   hour_of_day      crime_category   count
0            0  Drug/Alcohol Crime   11239
1            0       Miscellaneous   72893
2            0               Other   76261
3            0      Property Crime  256726
4            0     Sexual Offenses    5416
5            0       Violent Crime   40425
   hour_of_day      crime_category   count
0            0  Drug/Alcohol Crime   11239
1            0       Miscellaneous   72893
2            0               Other   76261
3            0      Property Crime  256726
4            0     Sexual Offenses    5416
5            0       Violent Crime   40425
# Aggregate to month and weekday level for the temporal investigation below.
monthly_counts = (
    crime_gdf.groupby(["year_month", "crime_category"])
             .size()
             .rename("count")
             .reset_index()
)

weekly_counts = (
    crime_gdf.groupby(["day_of_week", "crime_category"])
             .size()
             .rename("count")
             .reset_index()
)
import altair as alt

def create_time_charts(selected_category):
    """Build the monthly trend and weekly distribution charts for one category.

    Parameters
    ----------
    selected_category : str
        A crime category name, or "All" to aggregate across every category.
        Reads the module-level `monthly_counts` / `weekly_counts` frames.

    Returns
    -------
    tuple[alt.Chart, alt.Chart]
        (monthly line chart, day-of-week bar chart).
    """
    if selected_category == "All":
        df_month = monthly_counts.groupby("year_month")["count"].sum().reset_index()
        df_week = weekly_counts.groupby("day_of_week")["count"].sum().reset_index()
    else:
        df_month = monthly_counts[monthly_counts["crime_category"]==selected_category]
        # BUG FIX: .copy() — the boolean-mask slice is a view of
        # weekly_counts; assigning the day_name column onto it triggers
        # SettingWithCopyWarning and can silently fail to attach the column
        # under pandas copy-on-write.
        df_week = weekly_counts[weekly_counts["crime_category"]==selected_category].copy()

    month_chart = (
        alt.Chart(df_month)
        .mark_line(point=True)
        .encode(
            x=alt.X("year_month:N", sort=None, title="Year-Month"),
            y=alt.Y("count:Q", title="Crime Count"),
            tooltip=["year_month","count"]
        )
        .properties(
            width=600,
            height=300,
            title=f"Monthly Trend - {selected_category}"
        )
    )
    # Map pandas weekday numbers (Mon=0) to readable labels for the x axis.
    day_map = {0:"Mon",1:"Tue",2:"Wed",3:"Thu",4:"Fri",5:"Sat",6:"Sun"}
    df_week["day_name"] = df_week["day_of_week"].map(day_map)

    week_chart = (
        alt.Chart(df_week)
        .mark_bar()
        .encode(
            x=alt.X("day_name:N", sort=["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]),
            y=alt.Y("count:Q", title="Crime Count"),
            tooltip=["day_name","count"]
        )
        .properties(
            width=300,
            height=300,
            title=f"Weekly Distribution - {selected_category}"
        )
    )

    return month_chart, week_chart
import panel as pn

# Dropdown over all categories plus an "All" aggregate option.
crime_types_all = ["All"] + sorted(crime_gdf["crime_category"].unique().tolist())

category_selector = pn.widgets.Select(
    name="Crime Category",
    options=crime_types_all,
    value="All"
)

@pn.depends(category_selector.param.value)
def update_charts(selected_category):
    # Re-render both panes whenever the dropdown value changes.
    month_chart, week_chart = create_time_charts(selected_category)
    month_pane = pn.pane.Vega(month_chart, width=650, height=350)
    week_pane = pn.pane.Vega(week_chart, width=350, height=350)
    return pn.Row(month_pane, week_pane)

time_dashboard = pn.Column(
    "# Crime Monthly & Weekly Distribution",
    "Select a crime category to see monthly trend & weekly distribution",
    category_selector,
    update_charts
)

time_dashboard.servable()
import holoviews as hv
import hvplot.pandas
hv.extension('bokeh')
# The point geometries are Web Mercator coordinates; record that on the frame.
crime_gdf.crs = "EPSG:3857"

# Drop rows without a geometry, then materialise plain float x/y columns
# (needed by hvplot/datashader) and discard any non-finite coordinates.
crime_gdf = crime_gdf.loc[crime_gdf.geometry.notna()]

crime_gdf = crime_gdf.assign(
    x=crime_gdf.geometry.x.astype(float),
    y=crime_gdf.geometry.y.astype(float),
)

crime_gdf = crime_gdf.replace([np.inf, -np.inf], np.nan).dropna(subset=["x", "y"])

crime_gdf.geometry.head()
0    POINT (-8362262.529 4865786.288)
1    POINT (-8370515.230 4854664.866)
2    POINT (-8350639.372 4879687.881)
3    POINT (-8358307.404 4868543.262)
4    POINT (-8366984.170 4872476.776)
Name: geometry, dtype: geometry
crime_gdf.head(20)
cartodb_id the_geom the_geom_webmercator objectid dc_dist psa dispatch_date_time dispatch_date dispatch_time hour ... text_general_code point_x point_y geometry year_month crime_category day_of_week hour_of_day x y
0 2 0101000020E6100000A51C8299A5C752C006342AD3DCFF... 0101000020110F0000F80DE2A145E65FC1E5EC7592BE8F... 114 25 3 2023-03-11T17:12:00Z 2023-03-11 12:12:00 12.0 ... Robbery No Firearm -75.119482 39.998927 POINT (-8362262.529 4865786.288) 2023-03 Violent Crime 5 0 -8.362263e+06 4.865786e+06
1 4 0101000020E6100000F9245E3B64CC52C0B7195D940FF6... 0101000020110F00000426B7CE54EE5FC1C5E06D37E284... 116 01 1 2023-03-11T18:31:00Z 2023-03-11 13:31:00 13.0 ... Theft from Vehicle -75.193618 39.922350 POINT (-8370515.230 4854664.866) 2023-03 Property Crime 5 0 -8.370515e+06 4.854665e+06
2 7 0101000020E6100000118A52E7F6C052C0CFF41263190C... 0101000020110F00006728CED7EBDA5FC169DB64F8519D... 119 08 2 2023-03-11T22:13:00Z 2023-03-11 17:13:00 17.0 ... Thefts -75.015070 40.094525 POINT (-8350639.372 4879687.881) 2023-03 Property Crime 5 0 -8.350639e+06 4.879688e+06
3 123 0101000020E6100000E1F9FB7B5FC552C0159C0B6D4A02... 0101000020110F00009D28D4D968E25FC13CD5C3D06F92... 96 15 1 2023-03-11T12:42:00Z 2023-03-11 07:42:00 7.0 ... Thefts -75.083953 40.017896 POINT (-8358307.404 4868543.262) 2023-03 Property Crime 5 0 -8.358307e+06 4.868543e+06
4 126 0101000020E6100000D1CCD5875CCA52C014B723FFC005... 0101000020110F00002F28E30AE2EA5FC10090A3314796... 99 14 1 2023-03-12T00:54:00Z 2023-03-11 19:54:00 19.0 ... Robbery No Firearm -75.161898 40.044952 POINT (-8366984.170 4872476.776) 2023-03 Violent Crime 5 0 -8.366984e+06 4.872477e+06
5 128 0101000020E61000002B7C851E94C252C0FB6A79ABCF03... 0101000020110F000050B036BBA9DD5FC1A20DA3831F94... 101 15 3 2023-03-11T18:53:00Z 2023-03-11 13:53:00 13.0 ... Thefts -75.040290 40.029775 POINT (-8353446.925 4870270.057) 2023-03 Property Crime 5 0 -8.353447e+06 4.870270e+06
6 129 0101000020E6100000C71530E485C852C03A8354C44800... 0101000020110F000023CB499DC2E75FC1CD36223F3690... 102 25 3 2023-03-11T07:03:00Z 2023-03-11 02:03:00 2.0 ... Aggravated Assault Firearm -75.133172 40.002221 POINT (-8363786.458 4866264.986) 2023-03 Violent Crime 5 0 -8.363786e+06 4.866265e+06
7 138 0101000020E6100000E26C2165D7BE52C0EE405BD61608... 0101000020110F000097CD95A350D75FC182830489DE98... 110 08 2 2023-03-11T12:22:00Z 2023-03-11 07:22:00 7.0 ... Theft from Vehicle -74.981897 40.063197 POINT (-8346946.556 4875130.141) 2023-03 Property Crime 5 0 -8.346947e+06 4.875130e+06
8 146 0101000020E6100000ABAA7E4249CF52C0CAFACDC474F6... 0101000020110F000001F690843FF35FC18CE049475285... 284 12 2 2023-02-26T14:40:00Z 2023-02-26 09:40:00 9.0 ... Thefts -75.238846 39.925439 POINT (-8375550.071 4855113.114) 2023-02 Property Crime 6 0 -8.375550e+06 4.855113e+06
9 149 0101000020E6100000DF2A99ADC6CE52C00BF379200DFD... 0101000020110F0000EDBF38B661F25FC1DC5ECECBA08C... 287 19 2 2023-02-26T22:52:00Z 2023-02-26 17:52:00 17.0 ... Thefts -75.230876 39.976963 POINT (-8374662.847 4862595.184) 2023-02 Property Crime 6 0 -8.374663e+06 4.862595e+06
10 152 0101000020E6100000093D93E496CF52C0EDE0D4C575FC... 0101000020110F00007C0FB162C3F35FC1759C000EF98B... 290 19 2 2023-02-26T05:31:00Z 2023-02-26 00:31:00 0.0 ... Theft from Vehicle -75.243585 39.972344 POINT (-8376077.542 4861924.219) 2023-02 Property Crime 6 0 -8.376078e+06 4.861924e+06
11 153 0101000020E6100000D50648B048CB52C0E395DA41DBFC... 0101000020110F000017AE3E2E73EC5FC137E49986698C... 291 22 4 2023-02-26T18:35:00Z 2023-02-26 13:35:00 13.0 ... Theft from Vehicle -75.176312 39.975441 POINT (-8368588.723 4862374.103) 2023-02 Property Crime 6 0 -8.368589e+06 4.862374e+06
12 159 0101000020E6100000E753850E13CC52C01D586D8298FD... 0101000020110F000062125BECCAED5FC1E2F5A9473B8D... 297 22 4 2023-02-26T18:27:00Z 2023-02-26 13:27:00 13.0 ... Theft from Vehicle -75.188663 39.981217 POINT (-8369963.693 4863213.120) 2023-02 Property Crime 6 0 -8.369964e+06 4.863213e+06
13 295 0101000020E6100000DACF9CD4DBCB52C0E03C286A61F7... 0101000020110F00007325B21D6DED5FC17D9F205F5886... 213 17 2 2023-02-26T05:46:00Z 2023-02-26 00:46:00 0.0 ... Aggravated Assault Firearm -75.185292 39.932660 POINT (-8369588.464 4856161.486) 2023-02 Violent Crime 6 0 -8.369588e+06 4.856161e+06
14 298 0101000020E6100000F9EBA1BFC8C952C05482518B39F5... 0101000020110F0000A5891505E7E95FC1D8A22933F583... 216 03 2 2023-02-26T09:18:00Z 2023-02-26 04:18:00 4.0 ... Aggravated Assault No Firearm -75.152878 39.915819 POINT (-8365980.079 4853716.799) 2023-02 Violent Crime 6 0 -8.365980e+06 4.853717e+06
15 300 0101000020E6100000DE731E1DF0CA52C0A35E2A15D8F4... 0101000020110F0000352233BADCEB5FC1FAC9FC478983... 218 03 3 2023-02-26T21:31:00Z 2023-02-26 16:31:00 16.0 ... Theft from Vehicle -75.170905 39.912844 POINT (-8367986.909 4853285.125) 2023-02 Property Crime 6 0 -8.367987e+06 4.853285e+06
16 301 0101000020E61000003AA2CE1E60CB52C09D4C9A0E3604... 0101000020110F00009E3458FB9AEC5FC18ED7CF149194... 219 14 3 2023-02-26T21:23:00Z 2023-02-26 16:23:00 16.0 ... Thefts -75.177742 40.032900 POINT (-8368747.927 4870724.325) 2023-02 Property Crime 6 0 -8.368748e+06 4.870724e+06
17 303 0101000020E61000009D593B1F58C952C0D52F09FED403... 0101000020110F0000A1A859B627E95FC14653F26A2594... 221 35 2 2023-02-27T00:41:00Z 2023-02-26 19:41:00 19.0 ... Thefts -75.146004 40.029938 POINT (-8365214.849 4870293.671) 2023-02 Property Crime 6 0 -8.365215e+06 4.870294e+06
18 322 0101000020E6100000A4A812E9E7CB52C041D910F397F5... 0101000020110F0000B60B8DA281ED5FC1A7D720BD5D84... 240 01 2 2023-02-27T02:22:00Z 2023-02-26 21:22:00 21.0 ... Thefts -75.186030 39.918700 POINT (-8369670.540 4854134.955) 2023-02 Property Crime 6 0 -8.369671e+06 4.854135e+06
19 327 0101000020E6100000812040860EC752C0172A628519FF... 0101000020110F00008ADE100445E55FC132D1110EE68E... 245 24 1 2023-02-26T18:38:00Z 2023-02-26 13:38:00 13.0 ... Thefts -75.110261 39.992966 POINT (-8361236.064 4864920.220) 2023-02 Property Crime 6 0 -8.361236e+06 4.864920e+06

20 rows × 23 columns

import holoviews as hv
import hvplot.pandas
hv.extension('bokeh')
import datashader as ds
import numpy as np

# months & cats
# Sorted option lists for the widgets below.
months_sorted = sorted(crime_gdf["year_month"].unique().tolist())
cats_sorted = sorted(crime_gdf["crime_category"].unique().tolist())

# Slim frame carrying only what the datashaded map needs.
df_hv = crime_gdf[["year_month", "crime_category", "x", "y"]].copy()

# Per-month, per-category counts feeding the bar chart.
monthly_counts = (
    crime_gdf.groupby(["year_month", "crime_category"])
             .size()
             .rename("count")
             .reset_index()
)

# Fixed color per category so the legend stays stable across months.
_category_colors = [
    ("Violent Crime", "#e31a1c"),
    ("Property Crime", "#1f78b4"),
    ("Drug/Alcohol Crime", "#33a02c"),
    ("Sexual Offenses", "#ff7f00"),
    ("Miscellaneous", "#6a3d9a"),
    ("Other", "#b15928"),
]
color_key_dict = dict(_category_colors)

month_selector = pn.widgets.Select(
    name="Month",
    options=months_sorted,
    value=months_sorted[0],
)

crime_selector = pn.widgets.Select(
    name="Crime Category",
    options=["All"] + cats_sorted,
    value="Violent Crime",
)

def create_bar_chart(selected_month, selected_crime):
    """Return an Altair bar chart of monthly counts with the chosen month highlighted.

    Reads the module-level `monthly_counts` frame; "All" sums across categories.
    """
    if selected_crime == "All":
        chart_df = monthly_counts.groupby("year_month", as_index=False)["count"].sum()
    else:
        mask = monthly_counts["crime_category"] == selected_crime
        chart_df = monthly_counts[mask]

    # Selected month is drawn in orange, every other month in gray.
    highlight = alt.condition(
        alt.datum.year_month == selected_month,
        alt.value("orange"),
        alt.value("gray"),
    )

    x_axis = alt.X("year_month:N", sort=None, title="Month")
    y_axis = alt.Y("count:Q", title="Crime Count")

    bar = alt.Chart(chart_df).mark_bar().encode(
        x=x_axis,
        y=y_axis,
        color=highlight,
        tooltip=["year_month", "count"],
    ).properties(
        width=600,
        height=250,
        title=f"Monthly Distribution for '{selected_crime}'",
    )
    return bar

@pn.depends(month_selector, crime_selector)
def update_map(selected_month, selected_crime):
    """Datashaded point map of one month, optionally filtered to one category."""
    in_month = df_hv["year_month"] == selected_month
    if selected_crime == "All":
        sub_df = df_hv[in_month]
        # Count per category so each category keeps its own color.
        aggregator = ds.count_cat("crime_category")
        color_args = dict(color_key=color_key_dict)
    else:
        sub_df = df_hv[in_month & (df_hv["crime_category"] == selected_crime)]
        aggregator = ds.count()
        color_args = dict(cmap=["red"])

    # Nothing to draw: show a placeholder text element instead of a map.
    if sub_df.empty:
        placeholder = hv.Text(0, 0, f"No data for {selected_month}, {selected_crime}")
        return placeholder.opts(width=300, height=200)

    return sub_df.hvplot.points(
        x="x",
        y="y",
        crs="EPSG:3857",
        geo=True,
        tiles="CartoLight",
        datashade=True,
        dynspread=True,
        aggregator=aggregator,
        width=700,
        height=450,
        project=False,
        **color_args,
    )

@pn.depends(month_selector, crime_selector)
def combined_dashboard(selected_month, selected_crime):
    """Stack the datashaded map above the month-highlighted bar chart."""
    map_panel = update_map(selected_month, selected_crime)
    bar_panel = pn.pane.Vega(
        create_bar_chart(selected_month, selected_crime),
        width=650,
        height=300,
    )
    return pn.Column(map_panel, bar_panel)


app = pn.Column(
    "# Crime Distribution (Light Gray Map) with Legend & Bar Chart",
    pn.Row(month_selector, crime_selector),
    combined_dashboard,
)

app.servable()